[llvm] [DAGCombine] Propagate truncate to operands (PR #98666)

Justin Fargnoli via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 19 19:25:54 PDT 2024


https://github.com/justinfargnoli updated https://github.com/llvm/llvm-project/pull/98666

>From 09689e8e34d778c51047133921415f7ecf413661 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 30 Jun 2024 18:50:30 -0700
Subject: [PATCH 01/47] Initial commit

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 54 ++++++++++++++++++++-
 llvm/test/CodeGen/NVPTX/combine-truncate.ll | 10 ++++
 2 files changed, 63 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/combine-truncate.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bb76ffdfd99d7b..17294d2fc1619e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -754,7 +754,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // We have some custom DAG combine patterns for these nodes
   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
                        ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
-                       ISD::VSELECT});
+                       ISD::TRUNCATE, ISD::VSELECT});
 
   // setcc for f16x2 and bf16x2 needs special handling to prevent
   // legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -5666,6 +5666,52 @@ static SDValue PerformREMCombine(SDNode *N,
   return SDValue();
 }
 
+// truncate (logic_op x, y) --> logic_op (truncate x), (truncate y)
+// This will reduce register pressure. 
+static SDValue PerformTruncCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalizeOps()) {
+    SDValue LogicalOp = N->getOperand(0);
+    switch (LogicalOp.getOpcode()) {
+    default: 
+      break;
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::MUL:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR: {
+      EVT VT = N->getValueType(0);
+      EVT LogicalVT = LogicalOp.getValueType();
+      if (VT == MVT::i32 && LogicalVT == MVT::i64) {
+        const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+        if (VT.isScalarInteger() ||
+            TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
+          if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
+                return U->isMachineOpcode() ? 
+                       U->getMachineOpcode() == NVPTX::CVT_u32_u64 : 
+                       U->getOpcode() == ISD::TRUNCATE;
+              })) {
+            
+            SDLoc DL(N);
+            SDValue None = DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, 
+                                                     DL, MVT::i32);
+            SDNode *NarrowL = DCI.DAG.getMachineNode(
+                NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
+            SDNode *NarrowR = DCI.DAG.getMachineNode(
+                NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
+            return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
+                                   SDValue(NarrowL, 0), SDValue(NarrowR, 0));
+          }
+        }
+      }
+      break;
+    }
+    }
+  }
+  return SDValue();
+}
+
 enum OperandSignedness {
   Signed = 0,
   Unsigned,
@@ -6084,6 +6130,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
     case ISD::UREM:
     case ISD::SREM:
       return PerformREMCombine(N, DCI, OptLevel);
+    case ISD::TRUNCATE:
+      return PerformTruncCombine(N, DCI);
     case ISD::SETCC:
       return PerformSETCCCombine(N, DCI, STI.getSmVersion());
     case ISD::LOAD:
@@ -6101,6 +6149,10 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
     case ISD::VSELECT:
       return PerformVSELECTCombine(N, DCI);
   }
+
+  if (N->isMachineOpcode() && N->getMachineOpcode() == NVPTX::CVT_u32_u64)
+    return PerformTruncCombine(N, DCI);
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
new file mode 100644
index 00000000000000..f43887064acc0b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s | FileCheck %s
+
+define i32 @foo(i64 %a, i64 %b) {
+; CHECK: or.b32
+; CHECK-NOT: or.b64
+entry:
+  %or = or i64 %a, %b
+  %trunc = trunc i64 %or to i32
+  ret i32 %trunc
+}
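
At the IR level, the rewrite this first patch performs corresponds to the
hand-written sketch below (illustrative only, with made-up value names; not
part of the patch itself):

  ; Before: the logic op executes in 64 bits and ties up 64-bit registers.
  %or = or i64 %a, %b
  %trunc = trunc i64 %or to i32

  ; After: the operands are truncated first, so the logic op runs in 32 bits.
  %a32 = trunc i64 %a to i32
  %b32 = trunc i64 %b to i32
  %or32 = or i32 %a32, %b32

Since truncating 64-bit to 32-bit is free in SASS (see the isTruncateFree
comment later in the series), the narrowed form costs nothing extra while
halving the register footprint of the operation.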

>From 48c2b745dbc7f4ef641cd4c6131c2784c3400ce6 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 8 Jul 2024 14:07:53 -0700
Subject: [PATCH 02/47] Prefer early return

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 71 +++++++++++----------
 1 file changed, 36 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 17294d2fc1619e..23352f962842fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5670,45 +5670,46 @@ static SDValue PerformREMCombine(SDNode *N,
 // This will reduce register pressure. 
 static SDValue PerformTruncCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
-  if (DCI.isBeforeLegalizeOps()) {
-    SDValue LogicalOp = N->getOperand(0);
-    switch (LogicalOp.getOpcode()) {
-    default: 
-      break;
-    case ISD::ADD:
-    case ISD::SUB:
-    case ISD::MUL:
-    case ISD::AND:
-    case ISD::OR:
-    case ISD::XOR: {
-      EVT VT = N->getValueType(0);
-      EVT LogicalVT = LogicalOp.getValueType();
-      if (VT == MVT::i32 && LogicalVT == MVT::i64) {
-        const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
-        if (VT.isScalarInteger() ||
-            TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
-          if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
-                return U->isMachineOpcode() ? 
-                       U->getMachineOpcode() == NVPTX::CVT_u32_u64 : 
-                       U->getOpcode() == ISD::TRUNCATE;
-              })) {
-            
-            SDLoc DL(N);
-            SDValue None = DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, 
-                                                     DL, MVT::i32);
-            SDNode *NarrowL = DCI.DAG.getMachineNode(
-                NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
-            SDNode *NarrowR = DCI.DAG.getMachineNode(
-                NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
-            return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
-                                   SDValue(NarrowL, 0), SDValue(NarrowR, 0));
-          }
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue LogicalOp = N->getOperand(0);
+  switch (LogicalOp.getOpcode()) {
+  default: 
+    break;
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    EVT VT = N->getValueType(0);
+    EVT LogicalVT = LogicalOp.getValueType();
+    if (VT == MVT::i32 && LogicalVT == MVT::i64) {
+      const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+      if (VT.isScalarInteger() ||
+          TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
+        if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
+              return U->isMachineOpcode() ? 
+                      U->getMachineOpcode() == NVPTX::CVT_u32_u64 : 
+                      U->getOpcode() == ISD::TRUNCATE;
+            })) {          
+          SDLoc DL(N);
+          SDValue None = DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, 
+                                                    DL, MVT::i32);
+          SDNode *NarrowL = DCI.DAG.getMachineNode(
+              NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
+          SDNode *NarrowR = DCI.DAG.getMachineNode(
+              NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
+          return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
+                                  SDValue(NarrowL, 0), SDValue(NarrowR, 0));
         }
       }
-      break;
-    }
     }
+    break;
   }
+  }
+
   return SDValue();
 }
 

>From 0ba2746bf82867a7744a2db1d691fe4128974d70 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 8 Jul 2024 14:08:15 -0700
Subject: [PATCH 03/47] clang-format

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 23352f962842fe..bb4e7f52bc13d8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5667,7 +5667,7 @@ static SDValue PerformREMCombine(SDNode *N,
 }
 
 // truncate (logic_op x, y) --> logic_op (truncate x), (truncate y)
-// This will reduce register pressure. 
+// This will reduce register pressure.
 static SDValue PerformTruncCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
   if (!DCI.isBeforeLegalizeOps())
@@ -5675,7 +5675,7 @@ static SDValue PerformTruncCombine(SDNode *N,
 
   SDValue LogicalOp = N->getOperand(0);
   switch (LogicalOp.getOpcode()) {
-  default: 
+  default:
     break;
   case ISD::ADD:
   case ISD::SUB:
@@ -5690,19 +5690,19 @@ static SDValue PerformTruncCombine(SDNode *N,
       if (VT.isScalarInteger() ||
           TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
         if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
-              return U->isMachineOpcode() ? 
-                      U->getMachineOpcode() == NVPTX::CVT_u32_u64 : 
-                      U->getOpcode() == ISD::TRUNCATE;
-            })) {          
+              return U->isMachineOpcode()
+                         ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
+                         : U->getOpcode() == ISD::TRUNCATE;
+            })) {
           SDLoc DL(N);
-          SDValue None = DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, 
-                                                    DL, MVT::i32);
+          SDValue None =
+              DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
           SDNode *NarrowL = DCI.DAG.getMachineNode(
               NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
           SDNode *NarrowR = DCI.DAG.getMachineNode(
               NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
           return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
-                                  SDValue(NarrowL, 0), SDValue(NarrowR, 0));
+                                 SDValue(NarrowL, 0), SDValue(NarrowR, 0));
         }
       }
     }

>From a24805687273afd9be6bc315ccc21ae6acf45fea Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 8 Jul 2024 14:17:55 -0700
Subject: [PATCH 04/47] Add negative test for `ISD::TRUNCATE`

---
 llvm/test/CodeGen/NVPTX/combine-truncate.ll | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index f43887064acc0b..3d0827606dbc29 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s | FileCheck %s
 
-define i32 @foo(i64 %a, i64 %b) {
+define i32 @trunc(i64 %a, i64 %b) {
+; CHECK-LABEL: trunc
 ; CHECK: or.b32
 ; CHECK-NOT: or.b64
 entry:
@@ -8,3 +9,14 @@ entry:
   %trunc = trunc i64 %or to i32
   ret i32 %trunc
 }
+
+define i32 @trunc_not(i64 %a, i64 %b, ptr %p) {
+; CHECK-LABEL: trunc_not
+; CHECK: or.b64
+; CHECK-NOT: or.b32
+entry:
+  %or = or i64 %a, %b
+  %trunc = trunc i64 %or to i32
+  store i64 %or, ptr %p
+  ret i32 %trunc
+}

>From 6d6345b482fafc2f6503390f7f5c97105522e84c Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 11 Jul 2024 19:42:56 -0700
Subject: [PATCH 05/47] Add cvt and cvt_not test

---
 llvm/test/CodeGen/NVPTX/combine-truncate.ll | 90 ++++++++++++++++++---
 1 file changed, 79 insertions(+), 11 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index 3d0827606dbc29..1e0d63ca248808 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -1,22 +1,90 @@
-; RUN: llc < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
 
 define i32 @trunc(i64 %a, i64 %b) {
-; CHECK-LABEL: trunc
-; CHECK: or.b32
-; CHECK-NOT: or.b64
-entry:
+; CHECK-LABEL: trunc(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_param_0];
+; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_param_1];
+; CHECK-NEXT:    cvt.u32.u64 %r1, %rd2;
+; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
+; CHECK-NEXT:    or.b32 %r3, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT:    ret;
   %or = or i64 %a, %b
   %trunc = trunc i64 %or to i32
   ret i32 %trunc
 }
 
-define i32 @trunc_not(i64 %a, i64 %b, ptr %p) {
-; CHECK-LABEL: trunc_not
-; CHECK: or.b64
-; CHECK-NOT: or.b32
-entry:
+define i32 @trunc_not(i64 %a, i64 %b) {
+; CHECK-LABEL: trunc_not(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_not_param_0];
+; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_not_param_1];
+; CHECK-NEXT:    or.b64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    cvt.u32.u64 %r1, %rd3;
+; CHECK-NEXT:    mov.u64 %rd4, 0;
+; CHECK-NEXT:    st.u64 [%rd4], %rd3;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
   %or = or i64 %a, %b
   %trunc = trunc i64 %or to i32
-  store i64 %or, ptr %p
+  store i64 %or, ptr null
+  ret i32 %trunc
+}
+
+define i32 @trunc_cvt(i64 %a, i64 %b) {
+; CHECK-LABEL: trunc_cvt(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_cvt_param_0];
+; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_cvt_param_1];
+; CHECK-NEXT:    cvt.u32.u64 %r1, %rd2;
+; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
+; CHECK-NEXT:    add.s32 %r3, %r2, %r1;
+; CHECK-NEXT:    or.b32 %r4, %r3, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT:    ret;
+  %add = add i64 %a, %b
+  %or = or i64 %add, %a
+  %trunc = trunc i64 %or to i32
+  ret i32 %trunc
+}
+
+define i32 @trunc_cvt_not(i64 %a, i64 %b) {
+; CHECK-LABEL: trunc_cvt_not(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_cvt_not_param_0];
+; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_cvt_not_param_1];
+; CHECK-NEXT:    mov.u64 %rd3, 0;
+; CHECK-NEXT:    st.u64 [%rd3], %rd2;
+; CHECK-NEXT:    cvt.u32.u64 %r1, %rd2;
+; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
+; CHECK-NEXT:    add.s32 %r3, %r2, %r1;
+; CHECK-NEXT:    or.b32 %r4, %r3, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT:    ret;
+  %add = add i64 %a, %b
+  store i64 %b, ptr null
+  %or = or i64 %add, %a
+  %trunc = trunc i64 %or to i32
   ret i32 %trunc
 }

>From e70f281439e4cf3b4a863eba7f9123d1d513cda1 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 11 Jul 2024 19:52:32 -0700
Subject: [PATCH 06/47] Prefer early return

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 44 ++++++++++-----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bb4e7f52bc13d8..31837db13234f1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5685,28 +5685,28 @@ static SDValue PerformTruncCombine(SDNode *N,
   case ISD::XOR: {
     EVT VT = N->getValueType(0);
     EVT LogicalVT = LogicalOp.getValueType();
-    if (VT == MVT::i32 && LogicalVT == MVT::i64) {
-      const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
-      if (VT.isScalarInteger() ||
-          TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
-        if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
-              return U->isMachineOpcode()
-                         ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
-                         : U->getOpcode() == ISD::TRUNCATE;
-            })) {
-          SDLoc DL(N);
-          SDValue None =
-              DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
-          SDNode *NarrowL = DCI.DAG.getMachineNode(
-              NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
-          SDNode *NarrowR = DCI.DAG.getMachineNode(
-              NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
-          return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
-                                 SDValue(NarrowL, 0), SDValue(NarrowR, 0));
-        }
-      }
-    }
-    break;
+    if (VT != MVT::i32 || LogicalVT != MVT::i64) 
+      break;
+    const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+    if (!VT.isScalarInteger() && 
+        !TLI.isOperationLegal(LogicalOp.getOpcode(), VT))
+      break;
+    if (!all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
+          return U->isMachineOpcode()
+                      ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
+                      : U->getOpcode() == ISD::TRUNCATE;
+        }))
+      break;
+
+    SDLoc DL(N);
+    SDValue CVTNone =
+        DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
+    SDNode *NarrowL = DCI.DAG.getMachineNode(
+        NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), CVTNone);
+    SDNode *NarrowR = DCI.DAG.getMachineNode(
+        NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), CVTNone);
+    return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
+                            SDValue(NarrowL, 0), SDValue(NarrowR, 0));
   }
   }
 

>From bb93aeb1a58e95bfe3a08ec0d4a6e40835be1274 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 11 Jul 2024 19:53:08 -0700
Subject: [PATCH 07/47] clang-format

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 31837db13234f1..56005f87649fdd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5685,28 +5685,28 @@ static SDValue PerformTruncCombine(SDNode *N,
   case ISD::XOR: {
     EVT VT = N->getValueType(0);
     EVT LogicalVT = LogicalOp.getValueType();
-    if (VT != MVT::i32 || LogicalVT != MVT::i64) 
+    if (VT != MVT::i32 || LogicalVT != MVT::i64)
       break;
     const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
-    if (!VT.isScalarInteger() && 
+    if (!VT.isScalarInteger() &&
         !TLI.isOperationLegal(LogicalOp.getOpcode(), VT))
       break;
     if (!all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
           return U->isMachineOpcode()
-                      ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
-                      : U->getOpcode() == ISD::TRUNCATE;
+                     ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
+                     : U->getOpcode() == ISD::TRUNCATE;
         }))
       break;
 
     SDLoc DL(N);
     SDValue CVTNone =
         DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
-    SDNode *NarrowL = DCI.DAG.getMachineNode(
-        NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), CVTNone);
-    SDNode *NarrowR = DCI.DAG.getMachineNode(
-        NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), CVTNone);
-    return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
-                            SDValue(NarrowL, 0), SDValue(NarrowR, 0));
+    SDNode *NarrowL = DCI.DAG.getMachineNode(NVPTX::CVT_u32_u64, DL, VT,
+                                             LogicalOp.getOperand(0), CVTNone);
+    SDNode *NarrowR = DCI.DAG.getMachineNode(NVPTX::CVT_u32_u64, DL, VT,
+                                             LogicalOp.getOperand(1), CVTNone);
+    return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT, SDValue(NarrowL, 0),
+                           SDValue(NarrowR, 0));
   }
   }
 

>From d4d310f56f6f0e5c05a1e435456eb37d4b1ccd52 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 11 Jul 2024 19:55:20 -0700
Subject: [PATCH 08/47] Correct trunc_cvt_not test

---
 llvm/test/CodeGen/NVPTX/combine-truncate.ll | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index 1e0d63ca248808..30e415ebe95278 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -68,22 +68,22 @@ define i32 @trunc_cvt(i64 %a, i64 %b) {
 define i32 @trunc_cvt_not(i64 %a, i64 %b) {
 ; CHECK-LABEL: trunc_cvt_not(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<5>;
-; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_cvt_not_param_0];
 ; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_cvt_not_param_1];
-; CHECK-NEXT:    mov.u64 %rd3, 0;
-; CHECK-NEXT:    st.u64 [%rd3], %rd2;
-; CHECK-NEXT:    cvt.u32.u64 %r1, %rd2;
+; CHECK-NEXT:    add.s64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    mov.u64 %rd4, 0;
+; CHECK-NEXT:    st.u64 [%rd4], %rd3;
+; CHECK-NEXT:    cvt.u32.u64 %r1, %rd3;
 ; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
-; CHECK-NEXT:    add.s32 %r3, %r2, %r1;
-; CHECK-NEXT:    or.b32 %r4, %r3, %r2;
-; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT:    or.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r3;
 ; CHECK-NEXT:    ret;
   %add = add i64 %a, %b
-  store i64 %b, ptr null
+  store i64 %add, ptr null
   %or = or i64 %add, %a
   %trunc = trunc i64 %or to i32
   ret i32 %trunc

>From 2e0c6cf4f8903e205f5c1262520bfd0c9f5f99e2 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 15 Jul 2024 14:48:10 -0700
Subject: [PATCH 09/47] Delete NVPTXISelLowering implementation

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 54 +--------------------
 1 file changed, 1 insertion(+), 53 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 56005f87649fdd..dd2870fac9fa08 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -754,7 +754,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // We have some custom DAG combine patterns for these nodes
   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
                        ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
-                       ISD::TRUNCATE, ISD::VSELECT});
+                       ISD::VSELECT});
 
   // setcc for f16x2 and bf16x2 needs special handling to prevent
   // legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -5666,53 +5666,6 @@ static SDValue PerformREMCombine(SDNode *N,
   return SDValue();
 }
 
-// truncate (logic_op x, y) --> logic_op (truncate x), (truncate y)
-// This will reduce register pressure.
-static SDValue PerformTruncCombine(SDNode *N,
-                                   TargetLowering::DAGCombinerInfo &DCI) {
-  if (!DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  SDValue LogicalOp = N->getOperand(0);
-  switch (LogicalOp.getOpcode()) {
-  default:
-    break;
-  case ISD::ADD:
-  case ISD::SUB:
-  case ISD::MUL:
-  case ISD::AND:
-  case ISD::OR:
-  case ISD::XOR: {
-    EVT VT = N->getValueType(0);
-    EVT LogicalVT = LogicalOp.getValueType();
-    if (VT != MVT::i32 || LogicalVT != MVT::i64)
-      break;
-    const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
-    if (!VT.isScalarInteger() &&
-        !TLI.isOperationLegal(LogicalOp.getOpcode(), VT))
-      break;
-    if (!all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
-          return U->isMachineOpcode()
-                     ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
-                     : U->getOpcode() == ISD::TRUNCATE;
-        }))
-      break;
-
-    SDLoc DL(N);
-    SDValue CVTNone =
-        DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
-    SDNode *NarrowL = DCI.DAG.getMachineNode(NVPTX::CVT_u32_u64, DL, VT,
-                                             LogicalOp.getOperand(0), CVTNone);
-    SDNode *NarrowR = DCI.DAG.getMachineNode(NVPTX::CVT_u32_u64, DL, VT,
-                                             LogicalOp.getOperand(1), CVTNone);
-    return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT, SDValue(NarrowL, 0),
-                           SDValue(NarrowR, 0));
-  }
-  }
-
-  return SDValue();
-}
-
 enum OperandSignedness {
   Signed = 0,
   Unsigned,
@@ -6131,8 +6084,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
     case ISD::UREM:
     case ISD::SREM:
       return PerformREMCombine(N, DCI, OptLevel);
-    case ISD::TRUNCATE:
-      return PerformTruncCombine(N, DCI);
     case ISD::SETCC:
       return PerformSETCCCombine(N, DCI, STI.getSmVersion());
     case ISD::LOAD:
@@ -6151,9 +6102,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
       return PerformVSELECTCombine(N, DCI);
   }
 
-  if (N->isMachineOpcode() && N->getMachineOpcode() == NVPTX::CVT_u32_u64)
-    return PerformTruncCombine(N, DCI);
-
   return SDValue();
 }
 

>From 46cd6d5e3d358f14a98758f5c7c727bb245c56e2 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 15 Jul 2024 15:12:58 -0700
Subject: [PATCH 10/47] Perform optimization via DAGCombine + TLI

---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  2 ++
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  5 +++--
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     |  4 ++++
 llvm/test/CodeGen/NVPTX/combine-truncate.ll   | 20 +++++++------------
 4 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index eda38cd8a564d6..4ef85f4bb59dfc 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2979,6 +2979,8 @@ class TargetLoweringBase {
     return isTruncateFree(Val.getValueType(), VT2);
   }
 
+  virtual bool shouldReduceRegisterPressure() const { return false; }
+
   virtual bool isProfitableToHoist(Instruction *I) const { return true; }
 
   /// Return true if the extension represented by \p I is free.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b0a906743f29ff..f59cee06a4ba0d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5786,7 +5786,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
   }
 
   // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
-  if (HandOpcode == ISD::TRUNCATE) {
+  if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
     // If both operands have other uses, this transform would create extra
     // instructions without eliminating anything.
     if (!N0.hasOneUse() && !N1.hasOneUse())
@@ -15321,7 +15321,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   case ISD::OR:
   case ISD::XOR:
     if (!LegalOperations && N0.hasOneUse() &&
-        (isConstantOrConstantVector(N0.getOperand(0), true) ||
+        (TLI.shouldReduceRegisterPressure() ||
+         isConstantOrConstantVector(N0.getOperand(0), true) ||
          isConstantOrConstantVector(N0.getOperand(1), true))) {
       // TODO: We already restricted this to pre-legalization, but for vectors
       // we are extra cautious to not create an unsupported operation.
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 32e6b044b0de1f..722496cb12a81c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -505,6 +505,10 @@ class NVPTXTargetLowering : public TargetLowering {
            DstTy->getPrimitiveSizeInBits() == 32;
   }
 
+  bool shouldReduceRegisterPressure() const override {
+    return true;
+  }
+
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                          EVT VT) const override {
     if (VT.isVector())
diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index 30e415ebe95278..a2b4c444e920f0 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -8,14 +8,11 @@ define i32 @trunc(i64 %a, i64 %b) {
 ; CHECK-LABEL: trunc(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_param_1];
-; CHECK-NEXT:    cvt.u32.u64 %r1, %rd2;
-; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
-; CHECK-NEXT:    or.b32 %r3, %r2, %r1;
+; CHECK-NEXT:    ld.param.u32 %r1, [trunc_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [trunc_param_1];
+; CHECK-NEXT:    or.b32 %r3, %r1, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r3;
 ; CHECK-NEXT:    ret;
   %or = or i64 %a, %b
@@ -48,15 +45,12 @@ define i32 @trunc_cvt(i64 %a, i64 %b) {
 ; CHECK-LABEL: trunc_cvt(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
-; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_cvt_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_cvt_param_1];
-; CHECK-NEXT:    cvt.u32.u64 %r1, %rd2;
-; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
-; CHECK-NEXT:    add.s32 %r3, %r2, %r1;
-; CHECK-NEXT:    or.b32 %r4, %r3, %r2;
+; CHECK-NEXT:    ld.param.u32 %r1, [trunc_cvt_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [trunc_cvt_param_1];
+; CHECK-NEXT:    add.s32 %r3, %r1, %r2;
+; CHECK-NEXT:    or.b32 %r4, %r3, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r4;
 ; CHECK-NEXT:    ret;
   %add = add i64 %a, %b
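
Note that the new hook gates two mutually inverse folds: with
shouldReduceRegisterPressure() returning true, visitTRUNCATE pushes truncates
toward the operands, while hoistLogicOpWithSameOpcodeHands, which would pull
them back out, is disabled. Leaving both active could let the combiner
ping-pong between the two forms (illustration, not from the patch):

  ; visitTRUNCATE, enabled by the hook:
  ;   trunc (or i64 %a, %b)          -->  or i32 (trunc %a), (trunc %b)
  ; hoistLogicOpWithSameOpcodeHands, disabled by the hook:
  ;   or i32 (trunc %a), (trunc %b)  -->  trunc (or i64 %a, %b)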

>From 1e56923e89f3b8779a5de5d01eb4e26f3bb5f691 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 16 Jul 2024 14:01:32 -0700
Subject: [PATCH 11/47] Update variadics-backend.ll

---
 llvm/test/CodeGen/NVPTX/variadics-backend.ll | 65 +++++++++-----------
 1 file changed, 30 insertions(+), 35 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 0e0c89d3e0214f..59619fa009d1e2 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -11,8 +11,8 @@
 define dso_local i32 @variadics1(i32 noundef %first, ...) {
 ; CHECK-PTX-LABEL: variadics1(
 ; CHECK-PTX:       {
-; CHECK-PTX-NEXT:    .reg .b32 %r<11>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<11>;
+; CHECK-PTX-NEXT:    .reg .b32 %r<12>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<8>;
 ; CHECK-PTX-NEXT:    .reg .f64 %fd<7>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
@@ -26,23 +26,21 @@ define dso_local i32 @variadics1(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    add.s32 %r7, %r5, %r6;
 ; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 19;
 ; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
-; CHECK-PTX-NEXT:    ld.u64 %rd4, [%rd3];
-; CHECK-PTX-NEXT:    cvt.u64.u32 %rd5, %r7;
-; CHECK-PTX-NEXT:    add.s64 %rd6, %rd5, %rd4;
-; CHECK-PTX-NEXT:    cvt.u32.u64 %r8, %rd6;
-; CHECK-PTX-NEXT:    add.s64 %rd7, %rd3, 15;
-; CHECK-PTX-NEXT:    and.b64 %rd8, %rd7, -8;
-; CHECK-PTX-NEXT:    ld.f64 %fd1, [%rd8];
-; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd2, %r8;
+; CHECK-PTX-NEXT:    ld.u32 %r8, [%rd3];
+; CHECK-PTX-NEXT:    add.s32 %r9, %r7, %r8;
+; CHECK-PTX-NEXT:    add.s64 %rd4, %rd3, 15;
+; CHECK-PTX-NEXT:    and.b64 %rd5, %rd4, -8;
+; CHECK-PTX-NEXT:    ld.f64 %fd1, [%rd5];
+; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd2, %r9;
 ; CHECK-PTX-NEXT:    add.rn.f64 %fd3, %fd2, %fd1;
-; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r9, %fd3;
-; CHECK-PTX-NEXT:    add.s64 %rd9, %rd8, 15;
-; CHECK-PTX-NEXT:    and.b64 %rd10, %rd9, -8;
-; CHECK-PTX-NEXT:    ld.f64 %fd4, [%rd10];
-; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd5, %r9;
+; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r10, %fd3;
+; CHECK-PTX-NEXT:    add.s64 %rd6, %rd5, 15;
+; CHECK-PTX-NEXT:    and.b64 %rd7, %rd6, -8;
+; CHECK-PTX-NEXT:    ld.f64 %fd4, [%rd7];
+; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd5, %r10;
 ; CHECK-PTX-NEXT:    add.rn.f64 %fd6, %fd5, %fd4;
-; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r10, %fd6;
-; CHECK-PTX-NEXT:    st.param.b32 [func_retval0+0], %r10;
+; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r11, %fd6;
+; CHECK-PTX-NEXT:    st.param.b32 [func_retval0+0], %r11;
 ; CHECK-PTX-NEXT:    ret;
 entry:
   %vlist = alloca ptr, align 8
@@ -152,8 +150,8 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<6>;
-; CHECK-PTX-NEXT:    .reg .b32 %r<7>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<11>;
+; CHECK-PTX-NEXT:    .reg .b32 %r<8>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<8>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot2;
@@ -175,13 +173,11 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    shl.b16 %rs4, %rs3, 8;
 ; CHECK-PTX-NEXT:    or.b16 %rs5, %rs4, %rs2;
 ; CHECK-PTX-NEXT:    st.u16 [%SP+0], %rs5;
-; CHECK-PTX-NEXT:    ld.u64 %rd8, [%rd3+8];
-; CHECK-PTX-NEXT:    add.s32 %r4, %r1, %r2;
-; CHECK-PTX-NEXT:    add.s32 %r5, %r4, %r3;
-; CHECK-PTX-NEXT:    cvt.u64.u32 %rd9, %r5;
-; CHECK-PTX-NEXT:    add.s64 %rd10, %rd9, %rd8;
-; CHECK-PTX-NEXT:    cvt.u32.u64 %r6, %rd10;
-; CHECK-PTX-NEXT:    st.param.b32 [func_retval0+0], %r6;
+; CHECK-PTX-NEXT:    ld.u32 %r4, [%rd3+8];
+; CHECK-PTX-NEXT:    add.s32 %r5, %r1, %r2;
+; CHECK-PTX-NEXT:    add.s32 %r6, %r5, %r3;
+; CHECK-PTX-NEXT:    add.s32 %r7, %r6, %r4;
+; CHECK-PTX-NEXT:    st.param.b32 [func_retval0+0], %r7;
 ; CHECK-PTX-NEXT:    ret;
 entry:
   %vlist = alloca ptr, align 8
@@ -347,20 +343,19 @@ entry:
 define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, ...) {
 ; CHECK-PTX-LABEL: variadics4(
 ; CHECK-PTX:       {
-; CHECK-PTX-NEXT:    .reg .b32 %r<2>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<9>;
+; CHECK-PTX-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [variadics4_param_1];
 ; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 7;
 ; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
-; CHECK-PTX-NEXT:    ld.u64 %rd4, [%rd3];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd5, [variadics4_param_0];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd6, [variadics4_param_0+8];
-; CHECK-PTX-NEXT:    add.s64 %rd7, %rd5, %rd6;
-; CHECK-PTX-NEXT:    add.s64 %rd8, %rd7, %rd4;
-; CHECK-PTX-NEXT:    cvt.u32.u64 %r1, %rd8;
-; CHECK-PTX-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-PTX-NEXT:    ld.u32 %r1, [%rd3];
+; CHECK-PTX-NEXT:    ld.param.u32 %r2, [variadics4_param_0];
+; CHECK-PTX-NEXT:    ld.param.u32 %r3, [variadics4_param_0+8];
+; CHECK-PTX-NEXT:    add.s32 %r4, %r2, %r3;
+; CHECK-PTX-NEXT:    add.s32 %r5, %r4, %r1;
+; CHECK-PTX-NEXT:    st.param.b32 [func_retval0+0], %r5;
 ; CHECK-PTX-NEXT:    ret;
 entry:
   %vlist = alloca ptr, align 8

>From bff743d9de2a330bd94a1dc60dd9718b0213e6d7 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 16 Jul 2024 17:49:33 -0700
Subject: [PATCH 12/47] Save work

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 30 ++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index dd2870fac9fa08..dc20ea41b9da75 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -754,7 +754,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // We have some custom DAG combine patterns for these nodes
   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
                        ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
-                       ISD::VSELECT});
+                       ISD::VSELECT, ISD::SELECT});
 
   // setcc for f16x2 and bf16x2 needs special handling to prevent
   // legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -6066,6 +6066,32 @@ static SDValue PerformLOADCombine(SDNode *N,
       DL);
 }
 
+// These transformations were once reliably performed by instcombine, but
+// thanks to poison semantics they are no longer safe for LLVM IR; perform them
+// here instead.
+static SDValue PerformSELECTCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI) {
+  return SDValue();
+  if (!(N->getValueType(0) == MVT::i1))
+    return SDValue();
+
+  unsigned Opcode = N->getOpcode();
+  SDValue SecondOperand;
+  if (auto Const = dyn_cast<ConstantSDNode>(N->getOperand(2)); Const && Const->isZero()) {
+    // (select cond, x, false) -> (and cond, x)
+    Opcode = ISD::AND;
+    SecondOperand = N->getOperand(1);
+  } else if (auto Const = dyn_cast<ConstantSDNode>(N->getOperand(1)); Const && Const->isOne()) {
+    // (select cond, true, x) -> (or cond, x)
+    Opcode = ISD::OR;
+    SecondOperand = N->getOperand(2);
+  } else {
+    return SDValue();
+  }
+
+  return DCI.DAG.getNode(Opcode, SDLoc(N), MVT::i1, N->getOperand(0), SecondOperand);
+}
+
 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6088,6 +6114,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
       return PerformSETCCCombine(N, DCI, STI.getSmVersion());
     case ISD::LOAD:
       return PerformLOADCombine(N, DCI);
+    case ISD::SELECT:
+      return PerformSELECTCombine(N, DCI);
     case NVPTXISD::StoreRetval:
     case NVPTXISD::StoreRetvalV2:
     case NVPTXISD::StoreRetvalV4:
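
The poison caveat in the new comment is easiest to see with a concrete example
(hand-written IR, assuming %x may be poison):

  %r = select i1 %cond, i1 %x, i1 false
  ; %cond = false            =>  %r = false, even when %x is poison
  %r2 = and i1 %cond, %x
  ; %cond = false, %x poison =>  %r2 = poison

select shields the unchosen operand, while and propagates poison regardless of
its other operand, so the fold is only sound once poison can no longer be
observed, i.e. late in the backend rather than in instcombine.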

>From 810eead79209373295584b37623cb48725db7943 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 16 Jul 2024 17:53:12 -0700
Subject: [PATCH 13/47] Modify boolean-patterns.ll

---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td     | 6 ------
 llvm/test/CodeGen/NVPTX/boolean-patterns.ll | 6 ++----
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b7e210805db904..691c1dd1321687 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1577,12 +1577,6 @@ defm XOR : BITWISE<"xor", xor>;
 def : Pat<(mul Int1Regs:$a, Int1Regs:$b), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
 def : Pat<(mul Int1Regs:$a, (i1 imm:$b)), (ANDb1ri Int1Regs:$a, imm:$b)>;
 
-// These transformations were once reliably performed by instcombine, but thanks
-// to poison semantics they are no longer safe for LLVM IR, perform them here
-// instead.
-def : Pat<(select Int1Regs:$a, Int1Regs:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
-def : Pat<(select Int1Regs:$a, 1, Int1Regs:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>;
-
 // Lower logical v2i16/v4i8 ops as bitwise ops on b32.
 foreach vt = [v2i16, v4i8] in {
   def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
diff --git a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
index d38880599d1e62..d0b93763682398 100644
--- a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
+++ b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
@@ -18,16 +18,14 @@ define i1 @m2and_ri(i1 %a) {
 
 ; CHECK-LABEL: select2or
 define i1 @select2or(i1 %a, i1 %b) {
-; CHECK: or.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK-NOT: selp
+; CHECK: or.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
   %r = select i1 %a, i1 1, i1 %b
   ret i1 %r
 }
 
 ; CHECK-LABEL: select2and
 define i1 @select2and(i1 %a, i1 %b) {
-; CHECK: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK-NOT: selp
+; CHECK: and.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
   %r = select i1 %a, i1 %b, i1 0
   ret i1 %r
 }
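
The updated checks follow from where the fold now happens. The intent of
replacing the isel patterns with a DAG combine (patches 12-13) is that the
fold fires before i1 is promoted, so the logic op stays on i1 and selects to
predicate registers, whereas the old .td patterns matched only after promotion
to i16 (sketch, with PTX mnemonics as in the test):

  %r = select i1 %a, i1 %b, i1 false
  ; folded early, as a DAG combine:   and i1 %a, %b  ->  and.pred
  ; folded late, via the old .td pattern on i16      ->  and.b16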

>From f7de33662d94f38a22874c3d31bdc79a71c61d64 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 16 Jul 2024 18:51:45 -0700
Subject: [PATCH 14/47] Revert "Save work"

This reverts commit 680ac60d4e67194e6ba9c2ee8ed14fbe2b49cdc9.
---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 30 +--------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index dc20ea41b9da75..dd2870fac9fa08 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -754,7 +754,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // We have some custom DAG combine patterns for these nodes
   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
                        ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
-                       ISD::VSELECT, ISD::SELECT});
+                       ISD::VSELECT});
 
   // setcc for f16x2 and bf16x2 needs special handling to prevent
   // legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -6066,32 +6066,6 @@ static SDValue PerformLOADCombine(SDNode *N,
       DL);
 }
 
-// These transformations were once reliably performed by instcombine, but
-// thanks to poison semantics they are no longer safe for LLVM IR; perform them
-// here instead.
-static SDValue PerformSELECTCombine(SDNode *N,
-                                    TargetLowering::DAGCombinerInfo &DCI) {
-  return SDValue();
-  if (!(N->getValueType(0) == MVT::i1))
-    return SDValue();
-
-  unsigned Opcode = N->getOpcode();
-  SDValue SecondOperand;
-  if (auto Const = dyn_cast<ConstantSDNode>(N->getOperand(2)); Const && Const->isZero()) {
-    // (select cond, x, false) -> (and cond, x)
-    Opcode = ISD::AND;
-    SecondOperand = N->getOperand(1);
-  } else if (auto Const = dyn_cast<ConstantSDNode>(N->getOperand(1)); Const && Const->isOne()) {
-    // (select cond, true, x) -> (or cond, x)
-    Opcode = ISD::OR;
-    SecondOperand = N->getOperand(2);
-  } else {
-    return SDValue();
-  }
-
-  return DCI.DAG.getNode(Opcode, SDLoc(N), MVT::i1, N->getOperand(0), SecondOperand);
-}
-
 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6114,8 +6088,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
       return PerformSETCCCombine(N, DCI, STI.getSmVersion());
     case ISD::LOAD:
       return PerformLOADCombine(N, DCI);
-    case ISD::SELECT:
-      return PerformSELECTCombine(N, DCI);
     case NVPTXISD::StoreRetval:
     case NVPTXISD::StoreRetvalV2:
     case NVPTXISD::StoreRetvalV4:

>From a477a0d40a1810ffae398e3f6b5d3e35c74f00f5 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 10:37:24 -0700
Subject: [PATCH 15/47] Save work

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f59cee06a4ba0d..7310f7034b0590 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15365,6 +15365,17 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     break;
   }
 
+  switch (N0.getOpcode()) {
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    // if ()
+    break; 
+  }
+
   return SDValue();
 }
 

>From 66142fc02b171ccb8dac6398cc793a0411f56f08 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 10:38:25 -0700
Subject: [PATCH 16/47] Remove improper TLI use

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7310f7034b0590..260c96847d6a5f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15321,8 +15321,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   case ISD::OR:
   case ISD::XOR:
     if (!LegalOperations && N0.hasOneUse() &&
-        (TLI.shouldReduceRegisterPressure() ||
-         isConstantOrConstantVector(N0.getOperand(0), true) ||
+        (isConstantOrConstantVector(N0.getOperand(0), true) ||
          isConstantOrConstantVector(N0.getOperand(1), true))) {
       // TODO: We already restricted this to pre-legalization, but for vectors
       // we are extra cautious to not create an unsupported operation.

>From efde452b79df552095c14606fb2d7c9164b99a96 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 10:39:53 -0700
Subject: [PATCH 17/47] Remove white space

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index dd2870fac9fa08..bb76ffdfd99d7b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6101,7 +6101,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
     case ISD::VSELECT:
       return PerformVSELECTCombine(N, DCI);
   }
-
   return SDValue();
 }
 

>From 30bed099cf1f030461767eecac7073cd36cee7b0 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 10:40:27 -0700
Subject: [PATCH 18/47] clang-format

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 260c96847d6a5f..14dd8234e15aa7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15372,7 +15372,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   case ISD::OR:
   case ISD::XOR:
     // if ()
-    break; 
+    break;
   }
 
   return SDValue();
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 722496cb12a81c..60837b924e3e38 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -505,9 +505,7 @@ class NVPTXTargetLowering : public TargetLowering {
            DstTy->getPrimitiveSizeInBits() == 32;
   }
 
-  bool shouldReduceRegisterPressure() const override {
-    return true;
-  }
+  bool shouldReduceRegisterPressure() const override { return true; }
 
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                          EVT VT) const override {

>From 5cda37d1b6d5696295894d229a10b51ee0a38f8e Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 11:41:20 -0700
Subject: [PATCH 19/47] Implement transform in DAG combine

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 25 ++++++++++++-------
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       |  6 +++++
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 14dd8234e15aa7..c55dd703416dac 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15364,15 +15364,22 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     break;
   }
 
-  switch (N0.getOpcode()) {
-  case ISD::ADD:
-  case ISD::SUB:
-  case ISD::MUL:
-  case ISD::AND:
-  case ISD::OR:
-  case ISD::XOR:
-    // if ()
-    break;
+  if (TLI.shouldReduceRegisterPressure()) {
+    switch (N0.getOpcode()) {
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::MUL:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR:
+      if (!(N0.hasOneUse() && VT.isScalarInteger()))
+        break;
+      if (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))
+        break;
+      SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
+      SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
+      return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
+    }
   }
 
   return SDValue();
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 691c1dd1321687..b7e210805db904 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1577,6 +1577,12 @@ defm XOR : BITWISE<"xor", xor>;
 def : Pat<(mul Int1Regs:$a, Int1Regs:$b), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
 def : Pat<(mul Int1Regs:$a, (i1 imm:$b)), (ANDb1ri Int1Regs:$a, imm:$b)>;
 
+// These transformations were once reliably performed by instcombine, but thanks
+// to poison semantics they are no longer safe for LLVM IR, perform them here
+// instead.
+def : Pat<(select Int1Regs:$a, Int1Regs:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
+def : Pat<(select Int1Regs:$a, 1, Int1Regs:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>;
+
 // Lower logical v2i16/v4i8 ops as bitwise ops on b32.
 foreach vt = [v2i16, v4i8] in {
   def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
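
The N0.hasOneUse() guard in the new visitTRUNCATE code prevents the combine
from duplicating work when the wide op has other users, which is exactly the
trunc_not test case from earlier in the series (sketch):

  %or = or i64 %a, %b
  %trunc = trunc i64 %or to i32
  store i64 %or, ptr %p
  ; the store keeps the 64-bit or alive, so a narrowed 32-bit or
  ; would be created in addition to it rather than replacing it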

>From 262a5e78d486b800a8f7bfc5d701b0be4b39a5fb Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 15:36:45 -0700
Subject: [PATCH 20/47] Save work

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 60837b924e3e38..e984656ddfbc45 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -499,12 +499,20 @@ class NVPTXTargetLowering : public TargetLowering {
 
   bool isTruncateFree(Type *SrcTy, Type *DstTy) const override {
     // Truncating 64-bit to 32-bit is free in SASS.
-    if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
+    if (!(SrcTy->isIntegerTy() && DstTy->isIntegerTy()))
       return false;
     return SrcTy->getPrimitiveSizeInBits() == 64 &&
            DstTy->getPrimitiveSizeInBits() == 32;
   }
 
+  bool isTruncateFree(EVT FromVT, EVT ToVT) const override {
+    // Truncating 64-bit to 32-bit is free in SASS.
+    if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
+      return false;
+    return FromVT.getFixedSizeInBits() == 64 &&
+           ToVT.getFixedSizeInBits() == 32;
+  }
+
   bool shouldReduceRegisterPressure() const override { return true; }
 
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,

>From 07b991563b7e8d9a2d592f6ab61860161983c663 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 15:37:01 -0700
Subject: [PATCH 21/47] Remove isTruncateFree

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index e984656ddfbc45..ff40283faa6d71 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -505,14 +505,6 @@ class NVPTXTargetLowering : public TargetLowering {
            DstTy->getPrimitiveSizeInBits() == 32;
   }
 
-  bool isTruncateFree(EVT FromVT, EVT ToVT) const override {
-    // Truncating 64-bit to 32-bit is free in SASS.
-    if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
-      return false;
-    return FromVT.getFixedSizeInBits() == 64 &&
-           ToVT.getFixedSizeInBits() == 32;
-  }
-
   bool shouldReduceRegisterPressure() const override { return true; }
 
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,

>From 9416ee000fd0de3a31f3f5256afbafb1df76cba7 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 15:45:56 -0700
Subject: [PATCH 22/47] Use ptr in test

---
 llvm/test/CodeGen/NVPTX/combine-truncate.ll | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index a2b4c444e920f0..221b89c7812e78 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -59,7 +59,7 @@ define i32 @trunc_cvt(i64 %a, i64 %b) {
   ret i32 %trunc
 }
 
-define i32 @trunc_cvt_not(i64 %a, i64 %b) {
+define i32 @trunc_cvt_not(i64 %a, i64 %b, ptr %p) {
 ; CHECK-LABEL: trunc_cvt_not(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
@@ -69,7 +69,7 @@ define i32 @trunc_cvt_not(i64 %a, i64 %b) {
 ; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_cvt_not_param_0];
 ; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_cvt_not_param_1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd1, %rd2;
-; CHECK-NEXT:    mov.u64 %rd4, 0;
+; CHECK-NEXT:    ld.param.u64 %rd4, [trunc_cvt_not_param_2];
 ; CHECK-NEXT:    st.u64 [%rd4], %rd3;
 ; CHECK-NEXT:    cvt.u32.u64 %r1, %rd3;
 ; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
@@ -77,7 +77,7 @@ define i32 @trunc_cvt_not(i64 %a, i64 %b) {
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r3;
 ; CHECK-NEXT:    ret;
   %add = add i64 %a, %b
-  store i64 %add, ptr null
+  store i64 %add, ptr %p
   %or = or i64 %add, %a
   %trunc = trunc i64 %or to i32
   ret i32 %trunc

>From ef981993603ca0aea57085d7a6aaa9f9a8fc0bc4 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 15:55:31 -0700
Subject: [PATCH 23/47] Only run on free truncs

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  2 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     |  8 ++++++++
 llvm/test/CodeGen/NVPTX/combine-truncate.ll   | 16 ++++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c55dd703416dac..50c83def70017f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15372,7 +15372,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     case ISD::AND:
     case ISD::OR:
     case ISD::XOR:
-      if (!(N0.hasOneUse() && VT.isScalarInteger()))
+      if (!(N0.hasOneUse() && VT.isScalarInteger() && TLI.isTruncateFree(SrcVT, VT)))
         break;
       if (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))
         break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index ff40283faa6d71..e984656ddfbc45 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -505,6 +505,14 @@ class NVPTXTargetLowering : public TargetLowering {
            DstTy->getPrimitiveSizeInBits() == 32;
   }
 
+  bool isTruncateFree(EVT FromVT, EVT ToVT) const override {
+    // Truncating 64-bit to 32-bit is free in SASS.
+    if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
+      return false;
+    return FromVT.getFixedSizeInBits() == 64 &&
+           ToVT.getFixedSizeInBits() == 32;
+  }
+
   bool shouldReduceRegisterPressure() const override { return true; }
 
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index 221b89c7812e78..1c1c091e39bb41 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -82,3 +82,19 @@ define i32 @trunc_cvt_not(i64 %a, i64 %b, ptr %p) {
   %trunc = trunc i64 %or to i32
   ret i32 %trunc
 }
+
+define i16 @trunc_i32_to_i16_not(i32 %a, i32 %b) {
+; CHECK-LABEL: trunc_i32_to_i16_not(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %r1, [trunc_i32_to_i16_not_param_0];
+; CHECK-NEXT:    ld.param.u16 %r2, [trunc_i32_to_i16_not_param_1];
+; CHECK-NEXT:    or.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT:    ret;
+  %or = or i32 %a, %b
+  %trunc = trunc i32 %or to i16
+  ret i16 %trunc
+}
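
For contrast, a minimal hand-written sketch (illustrative only; the function
name is invented and is not part of the patch) of the case the new gate still
allows. Truncating i64 to i32 is free in SASS, so here the combine is expected
to fire and shrink the or to 32 bits, as in the existing trunc_cvt test:

  define i32 @trunc_i64_to_i32_free(i64 %a, i64 %b) {
    %or = or i64 %a, %b             ; 64-bit logic op whose only user is a trunc
    %trunc = trunc i64 %or to i32   ; free on NVPTX: isTruncateFree returns true
    ret i32 %trunc
  }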

>From 1f7285aa314a84e0ceb72ad348e8cd2c5bcf22f3 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:04:56 -0700
Subject: [PATCH 24/47] Add comment

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 50c83def70017f..23f5f20cdc2ae2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5786,6 +5786,10 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
   }
 
   // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
+  // 
+  // For targets that are particularly sensitive to register pressure it's preferable to
+  // increase the number of truncate instructions in order to decrease the bit 
+  // width of the logic_op.
   if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
     // If both operands have other uses, this transform would create extra
     // instructions without eliminating anything.
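
To make the tradeoff concrete, a rough hand-written illustration (not from the
patch; the function names are invented) of the two shapes the comment
contrasts. In the wide form, two i64 values stay live into a 64-bit op; in the
narrow form, the extra truncates mean only i32 values cross the op, roughly
halving its register footprint on a target where an i64 occupies a register
pair:

  ; narrow form, kept by pressure-sensitive targets: truncate the operands
  define i32 @narrow_add(i64 %x, i64 %y) {
    %tx = trunc i64 %x to i32
    %ty = trunc i64 %y to i32
    %s = add i32 %tx, %ty
    ret i32 %s
  }

  ; wide form, preferred otherwise: one wide op, then a single truncate
  define i32 @wide_add(i64 %x, i64 %y) {
    %s = add i64 %x, %y
    %t = trunc i64 %s to i32
    ret i32 %t
  }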

>From 8794daf527ca3097dfc69994d8ade74cf875a28a Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:05:32 -0700
Subject: [PATCH 25/47] clang-format

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 ++++++-----
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     |  3 +--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 23f5f20cdc2ae2..2cd0e04f2b7c75 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5786,10 +5786,10 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
   }
 
   // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
-  // 
-  // For targets that are particularly sensitive to register pressure it's preferable to
-  // increase the number of truncate instructions in order to decrease the bit 
-  // width of the logic_op.
+  //
+  // For targets that are particularly sensitive to register pressure it's
+  // preferable to increase the number of truncate instructions in order to
+  // decrease the bit width of the logic_op.
   if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
     // If both operands have other uses, this transform would create extra
     // instructions without eliminating anything.
@@ -15376,7 +15376,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     case ISD::AND:
     case ISD::OR:
     case ISD::XOR:
-      if (!(N0.hasOneUse() && VT.isScalarInteger() && TLI.isTruncateFree(SrcVT, VT)))
+      if (!(N0.hasOneUse() && VT.isScalarInteger() &&
+            TLI.isTruncateFree(SrcVT, VT)))
         break;
       if (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))
         break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index e984656ddfbc45..34d197f7f1551a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -509,8 +509,7 @@ class NVPTXTargetLowering : public TargetLowering {
     // Truncating 64-bit to 32-bit is free in SASS.
     if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
       return false;
-    return FromVT.getFixedSizeInBits() == 64 &&
-           ToVT.getFixedSizeInBits() == 32;
+    return FromVT.getFixedSizeInBits() == 64 && ToVT.getFixedSizeInBits() == 32;
   }
 
   bool shouldReduceRegisterPressure() const override { return true; }

>From 71b1c901eb1de5c79d4bbde5a43a28ed8664d6a8 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:06:47 -0700
Subject: [PATCH 26/47] explicitly mention GPU in comment

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2cd0e04f2b7c75..064d048c16d778 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5787,7 +5787,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
 
   // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
   //
-  // For targets that are particularly sensitive to register pressure it's
+  // For targets that are particularly sensitive to register pressure (e.g. GPUs) it's
   // preferable to increase the number of truncate instructions in order to
   // decrease the bit width of the logic_op.
   if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {

>From c067c3ecd17b0a42ddcdf529869b9bcfb60f419c Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:06:59 -0700
Subject: [PATCH 27/47] clang-format

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 064d048c16d778..51bcb42401b83c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5787,8 +5787,8 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
 
   // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
   //
-  // For targets that are particularly sensitive to register pressure (e.g. GPUs) it's
-  // preferable to increase the number of truncate instructions in order to
+  // For targets that are particularly sensitive to register pressure (e.g. GPUs)
+  // it's preferable to increase the number of truncate instructions in order to
   // decrease the bit width of the logic_op.
   if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
     // If both operands have other uses, this transform would create extra

>From d11c8388b99fbed1c0d55951d95e2d781c20d980 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:11:56 -0700
Subject: [PATCH 28/47] Add comment to TLI function

---
 llvm/include/llvm/CodeGen/TargetLowering.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 4ef85f4bb59dfc..51fbd15efa78f3 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2979,6 +2979,8 @@ class TargetLoweringBase {
     return isTruncateFree(Val.getValueType(), VT2);
   }
 
+  // Return true if the target will accept tradeoffs (e.g. increase the number 
+  // of instructions) to reduce register pressure.
   virtual bool shouldReduceRegisterPressure() const { return false; }
 
   virtual bool isProfitableToHoist(Instruction *I) const { return true; }

>From 0a7dea9f9145f53d70eea8492c3bf70a5ea560e3 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:12:11 -0700
Subject: [PATCH 29/47] clang-format

---
 llvm/include/llvm/CodeGen/TargetLowering.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 51fbd15efa78f3..e0ae6a1988106c 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2979,7 +2979,7 @@ class TargetLoweringBase {
     return isTruncateFree(Val.getValueType(), VT2);
   }
 
-  // Return true if the target will accept tradeoffs (e.g. increase the number 
+  // Return true if the target will accept tradeoffs (e.g. increase the number
   // of instructions) to reduce register pressure.
   virtual bool shouldReduceRegisterPressure() const { return false; }
 

>From 8702dd128ce4c57ccd703469933b2daacd9217e0 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:41:32 -0700
Subject: [PATCH 30/47] Update boolean-patterns.ll

---
 llvm/test/CodeGen/NVPTX/boolean-patterns.ll | 72 ++++++++++++++++++---
 1 file changed, 63 insertions(+), 9 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
index d0b93763682398..de0437cb96e398 100644
--- a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
+++ b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
@@ -1,31 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
-; CHECK-LABEL: m2and_rr
+target triple = "nvptx64-nvidia-cuda"
+
 define i1 @m2and_rr(i1 %a, i1 %b) {
-; CHECK: and.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
-; CHECK-NOT: mul
+; CHECK-LABEL: m2and_rr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<4>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [m2and_rr_param_1];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT:    ld.param.u8 %rs3, [m2and_rr_param_0];
+; CHECK-NEXT:    and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT:    setp.eq.b16 %p2, %rs4, 1;
+; CHECK-NEXT:    and.pred %p3, %p2, %p1;
+; CHECK-NEXT:    selp.u32 %r1, 1, 0, %p3;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
   %r = mul i1 %a, %b
   ret i1 %r
 }
 
-; CHECK-LABEL: m2and_ri
 define i1 @m2and_ri(i1 %a) {
-; CHECK-NOT: mul
+; CHECK-LABEL: m2and_ri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %r1, [m2and_ri_param_0];
+; CHECK-NEXT:    and.b32 %r2, %r1, 1;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT:    ret;
   %r = mul i1 %a, 1
   ret i1 %r
 }
 
-; CHECK-LABEL: select2or
 define i1 @select2or(i1 %a, i1 %b) {
-; CHECK: or.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
+; CHECK-LABEL: select2or(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<4>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [select2or_param_1];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT:    ld.param.u8 %rs3, [select2or_param_0];
+; CHECK-NEXT:    and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT:    setp.eq.b16 %p2, %rs4, 1;
+; CHECK-NEXT:    or.pred %p3, %p2, %p1;
+; CHECK-NEXT:    selp.u32 %r1, 1, 0, %p3;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
   %r = select i1 %a, i1 1, i1 %b
   ret i1 %r
 }
 
-; CHECK-LABEL: select2and
 define i1 @select2and(i1 %a, i1 %b) {
-; CHECK: and.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
+; CHECK-LABEL: select2and(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<4>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [select2and_param_1];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT:    ld.param.u8 %rs3, [select2and_param_0];
+; CHECK-NEXT:    and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT:    setp.eq.b16 %p2, %rs4, 1;
+; CHECK-NEXT:    and.pred %p3, %p2, %p1;
+; CHECK-NEXT:    selp.u32 %r1, 1, 0, %p3;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
   %r = select i1 %a, i1 %b, i1 0
   ret i1 %r
 }

>From 79034e9885ac7ad05ad95b6aab0e5c18db801b55 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 23 Jul 2024 16:12:00 -0700
Subject: [PATCH 31/47]  Use isNarrowingProfitable instead of
 shouldReduceRegisterPressure

---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  4 --
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 ++++-----
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     |  8 ++--
 llvm/test/CodeGen/NVPTX/boolean-patterns.ll   | 40 ++++++++-----------
 llvm/test/CodeGen/X86/apx/and.ll              |  6 +--
 ...-merge-scalar-constmask-interleavedbits.ll | 16 ++++----
 llvm/test/CodeGen/X86/vec_saddo.ll            |  8 +++-
 llvm/test/CodeGen/X86/vec_uaddo.ll            |  8 +++-
 8 files changed, 50 insertions(+), 58 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e0ae6a1988106c..eda38cd8a564d6 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2979,10 +2979,6 @@ class TargetLoweringBase {
     return isTruncateFree(Val.getValueType(), VT2);
   }
 
-  // Return true if the target will accept tradeoffs (e.g. increase the number
-  // of instructions) to reduce register pressure.
-  virtual bool shouldReduceRegisterPressure() const { return false; }
-
   virtual bool isProfitableToHoist(Instruction *I) const { return true; }
 
   /// Return true if the extension represented by \p I is free.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 51bcb42401b83c..4a9e416f95f0a6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5786,11 +5786,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
   }
 
   // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
-  //
-  // For targets that are particularly sensitive to register pressure (e.g. GPUs)
-  // it's preferable to increase the number of truncate instructions in order to
-  // decrease the bit width of the logic_op.
-  if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
+  if (HandOpcode == ISD::TRUNCATE) {
     // If both operands have other uses, this transform would create extra
     // instructions without eliminating anything.
     if (!N0.hasOneUse() && !N1.hasOneUse())
@@ -5802,9 +5798,14 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
     if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
       return SDValue();
     // Be extra careful sinking truncate. If it's free, there's no benefit in
-    // widening a binop. Also, don't create a logic op on an illegal type.
+    // widening a binop.
     if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
       return SDValue();
+    // Prevent an infinite loop if the target prefers the inverse 
+    // transformation.
+    if (TLI.isNarrowingProfitable(XVT, VT))
+      return SDValue();
+    // Don't create a logic op on an illegal type.
     if (!TLI.isTypeLegal(XVT))
       return SDValue();
     SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
@@ -15368,7 +15369,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     break;
   }
 
-  if (TLI.shouldReduceRegisterPressure()) {
+  if (TLI.isNarrowingProfitable(SrcVT, VT)) {
     switch (N0.getOpcode()) {
     case ISD::ADD:
     case ISD::SUB:
@@ -15376,8 +15377,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     case ISD::AND:
     case ISD::OR:
     case ISD::XOR:
-      if (!(N0.hasOneUse() && VT.isScalarInteger() &&
-            TLI.isTruncateFree(SrcVT, VT)))
+      if (!(N0.hasOneUse() && VT.isScalarInteger()))
         break;
       if (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))
         break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 34d197f7f1551a..6b0351d7bbd2b1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -505,15 +505,13 @@ class NVPTXTargetLowering : public TargetLowering {
            DstTy->getPrimitiveSizeInBits() == 32;
   }
 
-  bool isTruncateFree(EVT FromVT, EVT ToVT) const override {
+  bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override {
     // Truncating 64-bit to 32-bit is free in SASS.
-    if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
+    if (!(SrcVT.isScalarInteger() && DestVT.isScalarInteger()))
       return false;
-    return FromVT.getFixedSizeInBits() == 64 && ToVT.getFixedSizeInBits() == 32;
+    return SrcVT.getFixedSizeInBits() == 64 && DestVT.getFixedSizeInBits() == 32;
   }
 
-  bool shouldReduceRegisterPressure() const override { return true; }
-
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                          EVT VT) const override {
     if (VT.isVector())
diff --git a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
index de0437cb96e398..c30ebccdf9199c 100644
--- a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
+++ b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
@@ -43,20 +43,16 @@ define i1 @m2and_ri(i1 %a) {
 define i1 @select2or(i1 %a, i1 %b) {
 ; CHECK-LABEL: select2or(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<4>;
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [select2or_param_1];
-; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
-; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
-; CHECK-NEXT:    ld.param.u8 %rs3, [select2or_param_0];
-; CHECK-NEXT:    and.b16 %rs4, %rs3, 1;
-; CHECK-NEXT:    setp.eq.b16 %p2, %rs4, 1;
-; CHECK-NEXT:    or.pred %p3, %p2, %p1;
-; CHECK-NEXT:    selp.u32 %r1, 1, 0, %p3;
-; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ld.param.u8 %rs1, [select2or_param_0];
+; CHECK-NEXT:    ld.param.u8 %rs2, [select2or_param_1];
+; CHECK-NEXT:    or.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs3;
+; CHECK-NEXT:    and.b32 %r2, %r1, 1;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
 ; CHECK-NEXT:    ret;
   %r = select i1 %a, i1 1, i1 %b
   ret i1 %r
@@ -65,20 +61,16 @@ define i1 @select2or(i1 %a, i1 %b) {
 define i1 @select2and(i1 %a, i1 %b) {
 ; CHECK-LABEL: select2and(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<4>;
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [select2and_param_1];
-; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
-; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
-; CHECK-NEXT:    ld.param.u8 %rs3, [select2and_param_0];
-; CHECK-NEXT:    and.b16 %rs4, %rs3, 1;
-; CHECK-NEXT:    setp.eq.b16 %p2, %rs4, 1;
-; CHECK-NEXT:    and.pred %p3, %p2, %p1;
-; CHECK-NEXT:    selp.u32 %r1, 1, 0, %p3;
-; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ld.param.u8 %rs1, [select2and_param_0];
+; CHECK-NEXT:    ld.param.u8 %rs2, [select2and_param_1];
+; CHECK-NEXT:    and.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs3;
+; CHECK-NEXT:    and.b32 %r2, %r1, 1;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
 ; CHECK-NEXT:    ret;
   %r = select i1 %a, i1 %b, i1 0
   ret i1 %r
diff --git a/llvm/test/CodeGen/X86/apx/and.ll b/llvm/test/CodeGen/X86/apx/and.ll
index 23aed77b948b97..de27976a122f7b 100644
--- a/llvm/test/CodeGen/X86/apx/and.ll
+++ b/llvm/test/CodeGen/X86/apx/and.ll
@@ -5,14 +5,12 @@
 define i8 @and8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: and8rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    andl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0xf7]
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    andb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x20,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 ;
 ; NF-LABEL: and8rr:
 ; NF:       # %bb.0: # %entry
-; NF-NEXT:    {nf} andl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0xf7]
-; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    {nf} andb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x20,0xf7]
 ; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i8 %a, %b
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
index c4c4e5ed1fddeb..2e2c152c5506ad 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
@@ -114,19 +114,19 @@ define i64 @out64_constmask(i64 %x, i64 %y) {
 define i8 @in8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: in8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %esi, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andb $85, %dil
-; CHECK-NOBMI-NEXT:    xorb %dil, %al
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
+; CHECK-NOBMI-NEXT:    andb $85, %al
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    movl %esi, %eax
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andb $85, %dil
-; CHECK-BMI-NEXT:    xorb %dil, %al
+; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorb %sil, %al
+; CHECK-BMI-NEXT:    andb $85, %al
+; CHECK-BMI-NEXT:    xorb %sil, %al
 ; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i8 %x, %y
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 460c5fe11f82a5..0aea89e4acf018 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -1045,12 +1045,16 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: saddo_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm2
-; AVX512-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT:    vpslld $31, %xmm0, %xmm2
 ; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT:    vpslld $31, %xmm1, %xmm2
+; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k0
 ; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX512-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index bac118095331ca..34e2811cb9bb5c 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1098,12 +1098,16 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: uaddo_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm2
-; AVX512-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT:    vpslld $31, %xmm0, %xmm2
 ; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT:    vpslld $31, %xmm1, %xmm2
+; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k0
 ; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX512-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq
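
The isNarrowingProfitable bail-out added to hoistLogicOpWithSameOpcodeHands
above is what keeps the two combines from fighting. A hand-written sketch
(illustrative only) of a shape both rewrites can reach:

  define i32 @ping_pong(i64 %x, i64 %y) {
    %tx = trunc i64 %x to i32
    %ty = trunc i64 %y to i32
    %v = xor i32 %tx, %ty   ; the hoist would widen this to xor i64 + trunc,
    ret i32 %v              ; which visitTRUNCATE would then narrow again
  }

Without the guard, the hoist widens the xor to i64 and visitTRUNCATE (now keyed
on isNarrowingProfitable rather than shouldReduceRegisterPressure) immediately
narrows it back, looping forever.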

>From 95702a28623408f54a37b0ac68fa49eba206ac90 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 23 Jul 2024 16:13:40 -0700
Subject: [PATCH 32/47] clang-format

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4a9e416f95f0a6..012cb1eac2a8b4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5801,7 +5801,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
     // widening a binop.
     if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
       return SDValue();
-    // Prevent an infinite loop if the target prefers the inverse 
+    // Prevent an infinite loop if the target prefers the inverse
     // transformation.
     if (TLI.isNarrowingProfitable(XVT, VT))
       return SDValue();
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 6b0351d7bbd2b1..2c4579f81bbd95 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -509,7 +509,8 @@ class NVPTXTargetLowering : public TargetLowering {
     // Truncating 64-bit to 32-bit is free in SASS.
     if (!(SrcVT.isScalarInteger() && DestVT.isScalarInteger()))
       return false;
-    return SrcVT.getFixedSizeInBits() == 64 && DestVT.getFixedSizeInBits() == 32;
+    return SrcVT.getFixedSizeInBits() == 64 &&
+           DestVT.getFixedSizeInBits() == 32;
   }
 
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,

>From b4b8f2a4d4730f4362778e60eabd94ecb6aa9de6 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 11 Sep 2024 19:40:50 -0700
Subject: [PATCH 33/47] Update tests

---
 llvm/test/CodeGen/X86/2012-08-16-setcc.ll    | 10 ++---
 llvm/test/CodeGen/X86/add-and-not.ll         | 16 ++++----
 llvm/test/CodeGen/X86/and-encoding.ll        |  4 +-
 llvm/test/CodeGen/X86/and-with-overflow.ll   |  4 +-
 llvm/test/CodeGen/X86/atomic-unordered.ll    |  2 +-
 llvm/test/CodeGen/X86/avx512-calling-conv.ll |  8 ++--
 llvm/test/CodeGen/X86/avx512-intrinsics.ll   | 40 ++++++++++----------
 llvm/test/CodeGen/X86/avx512dq-mask-op.ll    |  2 +-
 llvm/test/CodeGen/X86/cmp-bool.ll            |  2 +-
 llvm/test/CodeGen/X86/cmp-concat.ll          |  2 +-
 llvm/test/CodeGen/X86/ctlo.ll                | 14 +++----
 llvm/test/CodeGen/X86/ctlz.ll                | 32 ++++++++--------
 12 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/llvm/test/CodeGen/X86/2012-08-16-setcc.ll b/llvm/test/CodeGen/X86/2012-08-16-setcc.ll
index 89ae5680e3ba94..f82439a7d82eec 100644
--- a/llvm/test/CodeGen/X86/2012-08-16-setcc.ll
+++ b/llvm/test/CodeGen/X86/2012-08-16-setcc.ll
@@ -6,8 +6,8 @@
 define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
 ; CHECK-LABEL: and_1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %dil, %sil
 ; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %1 = and i8 %b, %a
@@ -19,7 +19,7 @@ define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
 define zeroext i1 @and_2(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-LABEL: and_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    testl %edi, %esi
+; CHECK-NEXT:    testb %dil, %sil
 ; CHECK-NEXT:    setne %al
 ; CHECK-NEXT:    retq
   %1 = and i8 %b, %a
@@ -31,7 +31,7 @@ define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
 ; CHECK-LABEL: xor_1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    xorb %dil, %sil
 ; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %1 = xor i8 %b, %a
@@ -43,7 +43,7 @@ define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
 define zeroext i1 @xor_2(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-LABEL: xor_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    xorb %dil, %sil
 ; CHECK-NEXT:    setne %al
 ; CHECK-NEXT:    retq
   %1 = xor i8 %b, %a
diff --git a/llvm/test/CodeGen/X86/add-and-not.ll b/llvm/test/CodeGen/X86/add-and-not.ll
index 10e3a6bf6d5330..3f1798e2f9bdd4 100644
--- a/llvm/test/CodeGen/X86/add-and-not.ll
+++ b/llvm/test/CodeGen/X86/add-and-not.ll
@@ -13,8 +13,8 @@ define i8 @add_and_xor(i8 %x, i8 %y) {
 ;
 ; X64-LABEL: add_and_xor:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    orb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %xor = xor i8 %x, -1
@@ -79,8 +79,8 @@ define i8 @add_and_xor_commuted1(i8 %x, i8 %y) {
 ;
 ; X64-LABEL: add_and_xor_commuted1:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    orb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %xor = xor i8 %x, -1
@@ -98,8 +98,8 @@ define i8 @add_and_xor_commuted2(i8 %x, i8 %y) {
 ;
 ; X64-LABEL: add_and_xor_commuted2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    orb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %xor = xor i8 %x, -1
@@ -117,8 +117,8 @@ define i8 @add_and_xor_commuted3(i8 %x, i8 %y) {
 ;
 ; X64-LABEL: add_and_xor_commuted3:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    orb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %xor = xor i8 %x, -1
diff --git a/llvm/test/CodeGen/X86/and-encoding.ll b/llvm/test/CodeGen/X86/and-encoding.ll
index 248686ff8b7a2b..64897a1d432221 100644
--- a/llvm/test/CodeGen/X86/and-encoding.ll
+++ b/llvm/test/CodeGen/X86/and-encoding.ll
@@ -22,7 +22,7 @@ define void @f1() nounwind {
 define void @f2(i16 %x, ptr%y) nounwind  {
 ; CHECK-LABEL: f2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT:    andb $1, %dil # encoding: [0x40,0x80,0xe7,0x01]
 ; CHECK-NEXT:    movb %dil, (%rsi) # encoding: [0x40,0x88,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
   %c = trunc i16 %x to i1
@@ -33,7 +33,7 @@ define void @f2(i16 %x, ptr%y) nounwind  {
 define void @f3(i32 %x, ptr%y) nounwind {
 ; CHECK-LABEL: f3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT:    andb $1, %dil # encoding: [0x40,0x80,0xe7,0x01]
 ; CHECK-NEXT:    movb %dil, (%rsi) # encoding: [0x40,0x88,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
   %c = trunc i32 %x to i1
diff --git a/llvm/test/CodeGen/X86/and-with-overflow.ll b/llvm/test/CodeGen/X86/and-with-overflow.ll
index a63f6cc6ea7e26..70c8317915239a 100644
--- a/llvm/test/CodeGen/X86/and-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/and-with-overflow.ll
@@ -46,8 +46,8 @@ define i8 @and_i8_rr(i8 zeroext %0, i8 zeroext %1) {
 ;
 ; X64-LABEL: and_i8_rr:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    andb %dil, %sil
+; X64-NEXT:    movzbl %sil, %eax
 ; X64-NEXT:    cmovel %edi, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 3fb994cdb751a3..eb7a30f4742772 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -2009,7 +2009,7 @@ define i32 @split_load(ptr %p) {
 ; CHECK-O3-NEXT:    movq (%rdi), %rax
 ; CHECK-O3-NEXT:    movq %rax, %rcx
 ; CHECK-O3-NEXT:    shrq $32, %rcx
-; CHECK-O3-NEXT:    orl %eax, %ecx
+; CHECK-O3-NEXT:    orb %al, %cl
 ; CHECK-O3-NEXT:    movzbl %cl, %eax
 ; CHECK-O3-NEXT:    retq
   %v = load atomic i64, ptr %p unordered, align 8
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index b39b089faa2a5e..bdbbf8a048a09c 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -938,7 +938,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL-NEXT:    kshiftrw $13, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %r13d
 ; KNL-NEXT:    kshiftrw $14, %k0, %k1
-; KNL-NEXT:    andl $1, %edx
+; KNL-NEXT:    andb $1, %dl
 ; KNL-NEXT:    movb %dl, 2(%rax)
 ; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    andl $1, %edx
@@ -1251,7 +1251,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    kshiftrd $13, %k0, %k1
 ; SKX-NEXT:    kmovd %k1, %r13d
 ; SKX-NEXT:    kshiftrd $14, %k0, %k1
-; SKX-NEXT:    andl $1, %edx
+; SKX-NEXT:    andb $1, %dl
 ; SKX-NEXT:    movb %dl, 2(%rax)
 ; SKX-NEXT:    kmovd %k0, %edx
 ; SKX-NEXT:    andl $1, %edx
@@ -1564,7 +1564,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    kshiftrw $5, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %ecx
 ; KNL_X32-NEXT:    kshiftrw $6, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %ebx
+; KNL_X32-NEXT:    andb $1, %bl
 ; KNL_X32-NEXT:    movb %bl, 2(%eax)
 ; KNL_X32-NEXT:    kmovw %k0, %ebx
 ; KNL_X32-NEXT:    andl $1, %ebx
@@ -1892,7 +1892,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; FASTISEL-NEXT:    kshiftrd $13, %k0, %k1
 ; FASTISEL-NEXT:    kmovd %k1, %r13d
 ; FASTISEL-NEXT:    kshiftrd $14, %k0, %k1
-; FASTISEL-NEXT:    andl $1, %edx
+; FASTISEL-NEXT:    andb $1, %dl
 ; FASTISEL-NEXT:    movb %dl, 2(%rax)
 ; FASTISEL-NEXT:    kmovd %k0, %edx
 ; FASTISEL-NEXT:    andl $1, %edx
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index b77c753107a6e1..ff4a2f99c015d5 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -4548,17 +4548,17 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1
 ; X64-NEXT:    kmovw %k0, %esi
 ; X64-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; X64-NEXT:    kmovw %k0, %eax
-; X64-NEXT:    orl %ecx, %edx
-; X64-NEXT:    orl %esi, %eax
-; X64-NEXT:    orl %edx, %eax
+; X64-NEXT:    orb %cl, %dl
+; X64-NEXT:    orb %sil, %al
+; X64-NEXT:    orb %dl, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    .cfi_offset %ebx, -8
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vcmplesd %xmm1, %xmm0, %k0
@@ -4566,14 +4566,14 @@ define i8 at test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1
 ; X86-NEXT:    vcmpunordsd {sae}, %xmm1, %xmm0, %k0
 ; X86-NEXT:    kmovw %k0, %edx
 ; X86-NEXT:    vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
-; X86-NEXT:    kmovw %k0, %esi
+; X86-NEXT:    kmovw %k0, %ebx
 ; X86-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; X86-NEXT:    kmovw %k0, %eax
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    orb %cl, %dl
+; X86-NEXT:    orb %bl, %al
+; X86-NEXT:    orb %dl, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 
@@ -4566,14 +4566,14 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1
 ; X64-NEXT:    kmovw %k0, %esi
 ; X64-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; X64-NEXT:    kmovw %k0, %eax
-; X64-NEXT:    andl %ecx, %edx
-; X64-NEXT:    andl %esi, %eax
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    andb %cl, %dl
+; X64-NEXT:    andb %sil, %al
+; X64-NEXT:    andb %dl, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    .cfi_offset %ebx, -8
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vcmpless %xmm1, %xmm0, %k0
@@ -4643,14 +4643,14 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1,
 ; X86-NEXT:    vcmpunordss {sae}, %xmm1, %xmm0, %k0
 ; X86-NEXT:    kmovw %k0, %edx
 ; X86-NEXT:    vcmpneqss %xmm1, %xmm0, %k0 {%k1}
-; X86-NEXT:    kmovw %k0, %esi
+; X86-NEXT:    kmovw %k0, %ebx
 ; X86-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; X86-NEXT:    kmovw %k0, %eax
-; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    andb %cl, %dl
+; X86-NEXT:    andb %bl, %al
+; X86-NEXT:    andb %dl, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
diff --git a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll
index 041a86aff53fb6..17502f4a506f13 100644
--- a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll
@@ -33,7 +33,7 @@ define i8 @mand8(i8 %x, i8 %y) {
 ; CHECK-LABEL: mand8:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    orl %esi, %eax
+; CHECK-NEXT:    orb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %ma = bitcast i8 %x to <8 x i1>
diff --git a/llvm/test/CodeGen/X86/cmp-bool.ll b/llvm/test/CodeGen/X86/cmp-bool.ll
index 617b485e0de0f2..5ba3680c1b74de 100644
--- a/llvm/test/CodeGen/X86/cmp-bool.ll
+++ b/llvm/test/CodeGen/X86/cmp-bool.ll
@@ -4,7 +4,7 @@
 define void @bool_eq(i1 zeroext %a, i1 zeroext %b, ptr nocapture %c) nounwind {
 ; CHECK-LABEL: bool_eq:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    cmpb %sil, %dil
 ; CHECK-NEXT:    je .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %if.end
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/cmp-concat.ll b/llvm/test/CodeGen/X86/cmp-concat.ll
index 5e030de1409f22..816f16667734cc 100644
--- a/llvm/test/CodeGen/X86/cmp-concat.ll
+++ b/llvm/test/CodeGen/X86/cmp-concat.ll
@@ -4,7 +4,7 @@
 define i1 @cmp_allbits_concat_i8(i8 %x, i8 %y) {
 ; CHECK-LABEL: cmp_allbits_concat_i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl %esi, %edi
+; CHECK-NEXT:    andb %sil, %dil
 ; CHECK-NEXT:    cmpb $-1, %dil
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index f383c9a2544fca..ac8ab6cebf0a3d 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -21,7 +21,7 @@ define i8 @ctlo_i8(i8 %x) {
 ; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
 ; X86-NOCMOV-NEXT:    movzbl %al, %eax
 ; X86-NOCMOV-NEXT:    bsrl %eax, %eax
-; X86-NOCMOV-NEXT:    xorl $7, %eax
+; X86-NOCMOV-NEXT:    xorb $7, %al
 ; X86-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NOCMOV-NEXT:    retl
 ; X86-NOCMOV-NEXT:  .LBB0_1:
@@ -37,7 +37,7 @@ define i8 @ctlo_i8(i8 %x) {
 ; X86-CMOV-NEXT:    bsrl %eax, %ecx
 ; X86-CMOV-NEXT:    movl $15, %eax
 ; X86-CMOV-NEXT:    cmovnel %ecx, %eax
-; X86-CMOV-NEXT:    xorl $7, %eax
+; X86-CMOV-NEXT:    xorb $7, %al
 ; X86-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CMOV-NEXT:    retl
 ;
@@ -48,7 +48,7 @@ define i8 @ctlo_i8(i8 %x) {
 ; X64-NEXT:    bsrl %eax, %ecx
 ; X64-NEXT:    movl $15, %eax
 ; X64-NEXT:    cmovnel %ecx, %eax
-; X64-NEXT:    xorl $7, %eax
+; X64-NEXT:    xorb $7, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
@@ -58,7 +58,7 @@ define i8 @ctlo_i8(i8 %x) {
 ; X86-CLZ-NEXT:    notb %al
 ; X86-CLZ-NEXT:    movzbl %al, %eax
 ; X86-CLZ-NEXT:    lzcntl %eax, %eax
-; X86-CLZ-NEXT:    addl $-24, %eax
+; X86-CLZ-NEXT:    addb $-24, %al
 ; X86-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CLZ-NEXT:    retl
 ;
@@ -67,7 +67,7 @@ define i8 @ctlo_i8(i8 %x) {
 ; X64-CLZ-NEXT:    notb %dil
 ; X64-CLZ-NEXT:    movzbl %dil, %eax
 ; X64-CLZ-NEXT:    lzcntl %eax, %eax
-; X64-CLZ-NEXT:    addl $-24, %eax
+; X64-CLZ-NEXT:    addb $-24, %al
 ; X64-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT:    retq
   %tmp1 = xor i8 %x, -1
@@ -82,7 +82,7 @@ define i8 @ctlo_i8_undef(i8 %x) {
 ; X86-NEXT:    notb %al
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    bsrl %eax, %eax
-; X86-NEXT:    xorl $7, %eax
+; X86-NEXT:    xorb $7, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
@@ -91,7 +91,7 @@ define i8 @ctlo_i8_undef(i8 %x) {
 ; X64-NEXT:    notb %dil
 ; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    bsrl %eax, %eax
-; X64-NEXT:    xorl $7, %eax
+; X64-NEXT:    xorb $7, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll
index 68defaff78d37d..e7a11cf443f5e0 100644
--- a/llvm/test/CodeGen/X86/ctlz.ll
+++ b/llvm/test/CodeGen/X86/ctlz.ll
@@ -17,7 +17,7 @@ define i8 @ctlz_i8(i8 %x) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    bsrl %eax, %eax
-; X86-NEXT:    xorl $7, %eax
+; X86-NEXT:    xorb $7, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
@@ -25,7 +25,7 @@ define i8 @ctlz_i8(i8 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    bsrl %eax, %eax
-; X64-NEXT:    xorl $7, %eax
+; X64-NEXT:    xorb $7, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
@@ -226,7 +226,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
 ; X86-NOCMOV-NEXT:    movzbl %al, %eax
 ; X86-NOCMOV-NEXT:    bsrl %eax, %eax
-; X86-NOCMOV-NEXT:    xorl $7, %eax
+; X86-NOCMOV-NEXT:    xorb $7, %al
 ; X86-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NOCMOV-NEXT:    retl
 ; X86-NOCMOV-NEXT:  .LBB4_1:
@@ -240,7 +240,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X86-CMOV-NEXT:    bsrl %eax, %ecx
 ; X86-CMOV-NEXT:    movl $15, %eax
 ; X86-CMOV-NEXT:    cmovnel %ecx, %eax
-; X86-CMOV-NEXT:    xorl $7, %eax
+; X86-CMOV-NEXT:    xorb $7, %al
 ; X86-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CMOV-NEXT:    retl
 ;
@@ -250,7 +250,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X64-NEXT:    bsrl %eax, %ecx
 ; X64-NEXT:    movl $15, %eax
 ; X64-NEXT:    cmovnel %ecx, %eax
-; X64-NEXT:    xorl $7, %eax
+; X64-NEXT:    xorb $7, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
@@ -258,7 +258,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X86-CLZ:       # %bb.0:
 ; X86-CLZ-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-CLZ-NEXT:    lzcntl %eax, %eax
-; X86-CLZ-NEXT:    addl $-24, %eax
+; X86-CLZ-NEXT:    addb $-24, %al
 ; X86-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CLZ-NEXT:    retl
 ;
@@ -266,7 +266,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X64-CLZ:       # %bb.0:
 ; X64-CLZ-NEXT:    movzbl %dil, %eax
 ; X64-CLZ-NEXT:    lzcntl %eax, %eax
-; X64-CLZ-NEXT:    addl $-24, %eax
+; X64-CLZ-NEXT:    addb $-24, %al
 ; X64-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT:    retq
 ;
@@ -274,7 +274,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X64-FASTLZCNT:       # %bb.0:
 ; X64-FASTLZCNT-NEXT:    movzbl %dil, %eax
 ; X64-FASTLZCNT-NEXT:    lzcntl %eax, %eax
-; X64-FASTLZCNT-NEXT:    addl $-24, %eax
+; X64-FASTLZCNT-NEXT:    addb $-24, %al
 ; X64-FASTLZCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-FASTLZCNT-NEXT:    retq
 ;
@@ -282,7 +282,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X86-FASTLZCNT:       # %bb.0:
 ; X86-FASTLZCNT-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-FASTLZCNT-NEXT:    lzcntl %eax, %eax
-; X86-FASTLZCNT-NEXT:    addl $-24, %eax
+; X86-FASTLZCNT-NEXT:    addb $-24, %al
 ; X86-FASTLZCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-FASTLZCNT-NEXT:    retl
   %tmp1 = call i8 @llvm.ctlz.i8(i8 %n, i1 false)
@@ -653,7 +653,7 @@ define i8 @ctlz_i8_knownbits(i8 %x)  {
 ; X86-NEXT:    orb $64, %al
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    bsrl %eax, %eax
-; X86-NEXT:    xorl $7, %eax
+; X86-NEXT:    xorb $7, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
@@ -662,7 +662,7 @@ define i8 @ctlz_i8_knownbits(i8 %x)  {
 ; X64-NEXT:    orb $64, %dil
 ; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    bsrl %eax, %eax
-; X64-NEXT:    xorl $7, %eax
+; X64-NEXT:    xorb $7, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
@@ -962,7 +962,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
 ; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
 ; X86-NOCMOV-NEXT:    movzbl %al, %eax
 ; X86-NOCMOV-NEXT:    bsrl %eax, %eax
-; X86-NOCMOV-NEXT:    xorl $7, %eax
+; X86-NOCMOV-NEXT:    xorb $7, %al
 ; X86-NOCMOV-NEXT:    xorb $7, %al
 ; X86-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NOCMOV-NEXT:    retl
@@ -994,7 +994,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
 ; X86-CLZ:       # %bb.0:
 ; X86-CLZ-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-CLZ-NEXT:    lzcntl %eax, %eax
-; X86-CLZ-NEXT:    addl $-24, %eax
+; X86-CLZ-NEXT:    addb $-24, %al
 ; X86-CLZ-NEXT:    xorb $7, %al
 ; X86-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CLZ-NEXT:    retl
@@ -1003,7 +1003,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
 ; X64-CLZ:       # %bb.0:
 ; X64-CLZ-NEXT:    movzbl %dil, %eax
 ; X64-CLZ-NEXT:    lzcntl %eax, %eax
-; X64-CLZ-NEXT:    addl $-24, %eax
+; X64-CLZ-NEXT:    addb $-24, %al
 ; X64-CLZ-NEXT:    xorb $7, %al
 ; X64-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT:    retq
@@ -1012,7 +1012,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
 ; X64-FASTLZCNT:       # %bb.0:
 ; X64-FASTLZCNT-NEXT:    movzbl %dil, %eax
 ; X64-FASTLZCNT-NEXT:    lzcntl %eax, %eax
-; X64-FASTLZCNT-NEXT:    addl $-24, %eax
+; X64-FASTLZCNT-NEXT:    addb $-24, %al
 ; X64-FASTLZCNT-NEXT:    xorb $7, %al
 ; X64-FASTLZCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-FASTLZCNT-NEXT:    retq
@@ -1021,7 +1021,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
 ; X86-FASTLZCNT:       # %bb.0:
 ; X86-FASTLZCNT-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-FASTLZCNT-NEXT:    lzcntl %eax, %eax
-; X86-FASTLZCNT-NEXT:    addl $-24, %eax
+; X86-FASTLZCNT-NEXT:    addb $-24, %al
 ; X86-FASTLZCNT-NEXT:    xorb $7, %al
 ; X86-FASTLZCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-FASTLZCNT-NEXT:    retl

>From a4360a94b3ae9ac78dd859a024f4b05d46567a7f Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 11 Sep 2024 19:54:31 -0700
Subject: [PATCH 34/47] Update more tests

---
 llvm/test/CodeGen/X86/apx/or.ll               |  6 +-
 llvm/test/CodeGen/X86/apx/xor.ll              | 14 ++-
 .../CodeGen/X86/dag-update-nodetomatch.ll     | 30 +++---
 llvm/test/CodeGen/X86/fast-isel-fcmp.ll       |  8 +-
 llvm/test/CodeGen/X86/fptosi-sat-scalar.ll    | 16 ++--
 llvm/test/CodeGen/X86/fptoui-sat-scalar.ll    | 16 ++--
 llvm/test/CodeGen/X86/isel-and.ll             |  4 +-
 llvm/test/CodeGen/X86/isel-or.ll              |  4 +-
 llvm/test/CodeGen/X86/isel-xor.ll             |  4 +-
 llvm/test/CodeGen/X86/jump_sign.ll            |  3 +-
 llvm/test/CodeGen/X86/known-pow2.ll           |  2 +-
 llvm/test/CodeGen/X86/large-pic-string.ll     |  2 +-
 llvm/test/CodeGen/X86/logic-shift.ll          | 20 ++--
 llvm/test/CodeGen/X86/lzcnt.ll                |  6 +-
 llvm/test/CodeGen/X86/masked_store_trunc.ll   | 90 +++++++++---------
 .../CodeGen/X86/masked_store_trunc_ssat.ll    | 90 +++++++++---------
 .../CodeGen/X86/masked_store_trunc_usat.ll    | 94 +++++++++----------
 llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll |  6 +-
 llvm/test/CodeGen/X86/mixed-ptr-sizes.ll      |  4 +-
 llvm/test/CodeGen/X86/or-with-overflow.ll     |  4 +-
 llvm/test/CodeGen/X86/parity-vec.ll           | 14 +--
 llvm/test/CodeGen/X86/parity.ll               |  7 +-
 llvm/test/CodeGen/X86/pr14088.ll              |  8 +-
 llvm/test/CodeGen/X86/pr15267.ll              | 29 +++---
 llvm/test/CodeGen/X86/pr35761.ll              |  8 +-
 llvm/test/CodeGen/X86/pr40539.ll              |  2 +-
 .../CodeGen/X86/replace-load-and-with-bzhi.ll |  4 +-
 llvm/test/CodeGen/X86/sdiv_fix_sat.ll         |  8 +-
 llvm/test/CodeGen/X86/setcc-logic.ll          | 12 +--
 llvm/test/CodeGen/X86/setoeq.ll               |  4 +-
 .../CodeGen/X86/srem-seteq-vec-nonsplat.ll    | 12 +--
 .../statepoint-spill-slot-size-promotion.ll   |  2 +-
 ...asked-merge-scalar-constmask-innerouter.ll | 16 ++--
 ...-scalar-constmask-interleavedbytehalves.ll | 16 ++--
 ...d-masked-merge-scalar-constmask-lowhigh.ll | 16 ++--
 ...unfold-masked-merge-vector-variablemask.ll | 87 ++++++++---------
 .../test/CodeGen/X86/vector-compare-all_of.ll |  4 +-
 .../test/CodeGen/X86/vector-compare-any_of.ll |  2 +-
 llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 12 +--
 llvm/test/CodeGen/X86/xor-with-overflow.ll    |  4 +-
 .../LoopStrengthReduce/X86/ivchain-X86.ll     | 42 ++++-----
 41 files changed, 353 insertions(+), 379 deletions(-)

diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll
index e51ba9d9bf0391..593c16dea09a28 100644
--- a/llvm/test/CodeGen/X86/apx/or.ll
+++ b/llvm/test/CodeGen/X86/apx/or.ll
@@ -5,14 +5,12 @@
 define i8 @or8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: or8rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    orl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0xf7]
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    orb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x08,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 ;
 ; NF-LABEL: or8rr:
 ; NF:       # %bb.0: # %entry
-; NF-NEXT:    {nf} orl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0xf7]
-; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    {nf} orb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x08,0xf7]
 ; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i8 %a, %b
diff --git a/llvm/test/CodeGen/X86/apx/xor.ll b/llvm/test/CodeGen/X86/apx/xor.ll
index d908849e2848f3..4b67a4af7a6423 100644
--- a/llvm/test/CodeGen/X86/apx/xor.ll
+++ b/llvm/test/CodeGen/X86/apx/xor.ll
@@ -5,14 +5,12 @@
 define i8 @xor8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: xor8rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0xf7]
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    xorb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x30,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 ;
 ; NF-LABEL: xor8rr:
 ; NF:       # %bb.0: # %entry
-; NF-NEXT:    {nf} xorl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xf7]
-; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    {nf} xorb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x30,0xf7]
 ; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i8 %a, %b
@@ -428,8 +426,8 @@ entry:
 define i1 @xorflag8rr(i8 %a, i8 %b) {
 ; CHECK-LABEL: xorflag8rr:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xfe]
-; CHECK-NEXT:    xorb $-1, %sil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf6,0xff]
+; CHECK-NEXT:    xorb %dil, %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x30,0xfe]
+; CHECK-NEXT:    xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff]
 ; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
@@ -437,8 +435,8 @@ define i1 @xorflag8rr(i8 %a, i8 %b) {
 ;
 ; NF-LABEL: xorflag8rr:
 ; NF:       # %bb.0:
-; NF-NEXT:    xorl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xfe]
-; NF-NEXT:    xorb $-1, %sil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf6,0xff]
+; NF-NEXT:    {nf} xorb %dil, %sil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x30,0xfe]
+; NF-NEXT:    xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff]
 ; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
 ; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
index adb7319fe80b11..07d443d30374c7 100644
--- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
+++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
@@ -24,25 +24,25 @@ define void @_Z1nv() local_unnamed_addr {
 ; CHECK-LABEL: _Z1nv:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq k@GOTPCREL(%rip), %rax
-; CHECK-NEXT:    movl 4(%rax), %edx
+; CHECK-NEXT:    movl 4(%rax), %ecx
 ; CHECK-NEXT:    movq c@GOTPCREL(%rip), %rax
-; CHECK-NEXT:    movswl (%rax), %ecx
+; CHECK-NEXT:    movswl (%rax), %esi
 ; CHECK-NEXT:    movq b@GOTPCREL(%rip), %rax
 ; CHECK-NEXT:    movswl (%rax), %edi
-; CHECK-NEXT:    movq a@GOTPCREL(%rip), %rsi
-; CHECK-NEXT:    movl (%rsi), %esi
+; CHECK-NEXT:    movq a@GOTPCREL(%rip), %rdx
+; CHECK-NEXT:    movl (%rdx), %edx
 ; CHECK-NEXT:    movq l@GOTPCREL(%rip), %r8
 ; CHECK-NEXT:    movl (%r8), %r8d
 ; CHECK-NEXT:    movl %r8d, %r9d
 ; CHECK-NEXT:    shll $7, %r9d
 ; CHECK-NEXT:    sarl $7, %r9d
 ; CHECK-NEXT:    negl %r9d
-; CHECK-NEXT:    testl %esi, %esi
-; CHECK-NEXT:    cmovel %esi, %r9d
-; CHECK-NEXT:    movzwl %dx, %r10d
-; CHECK-NEXT:    leal (%rcx,%r10,2), %ecx
-; CHECK-NEXT:    addl %edi, %ecx
-; CHECK-NEXT:    cmpl %r9d, %ecx
+; CHECK-NEXT:    testl %edx, %edx
+; CHECK-NEXT:    cmovel %edx, %r9d
+; CHECK-NEXT:    movzwl %cx, %r10d
+; CHECK-NEXT:    leal (%rsi,%r10,2), %esi
+; CHECK-NEXT:    addl %edi, %esi
+; CHECK-NEXT:    cmpl %r9d, %esi
 ; CHECK-NEXT:    sete %dil
 ; CHECK-NEXT:    testl $33554431, %r8d # imm = 0x1FFFFFF
 ; CHECK-NEXT:    sete %r8b
@@ -50,12 +50,12 @@ define void @_Z1nv() local_unnamed_addr {
 ; CHECK-NEXT:    movzbl %r8b, %edi
 ; CHECK-NEXT:    movq e@GOTPCREL(%rip), %r8
 ; CHECK-NEXT:    movw %di, (%r8)
-; CHECK-NEXT:    notl %ecx
-; CHECK-NEXT:    shrl $31, %ecx
-; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    notl %esi
+; CHECK-NEXT:    shrl $31, %esi
+; CHECK-NEXT:    addb %sil, %cl
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT:    sarl %cl, %esi
-; CHECK-NEXT:    movw %si, (%rax)
+; CHECK-NEXT:    sarl %cl, %edx
+; CHECK-NEXT:    movw %dx, (%rax)
 ; CHECK-NEXT:    retq
 entry:
   %bf.load = load i32, ptr getelementptr inbounds (%struct.m, ptr @k, i64 0, i32 0, i32 1), align 4
diff --git a/llvm/test/CodeGen/X86/fast-isel-fcmp.ll b/llvm/test/CodeGen/X86/fast-isel-fcmp.ll
index b9ef3154cd1c3a..b56e2c0faa439a 100644
--- a/llvm/test/CodeGen/X86/fast-isel-fcmp.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-fcmp.ll
@@ -9,7 +9,7 @@ define zeroext i1 @fcmp_oeq(float %x, float %y) {
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    cmpeqss %xmm1, %xmm0
 ; SDAG-NEXT:    movd %xmm0, %eax
-; SDAG-NEXT:    andl $1, %eax
+; SDAG-NEXT:    andb $1, %al
 ; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    retq
 ;
@@ -327,7 +327,7 @@ define zeroext i1 @fcmp_une(float %x, float %y) {
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    cmpneqss %xmm1, %xmm0
 ; SDAG-NEXT:    movd %xmm0, %eax
-; SDAG-NEXT:    andl $1, %eax
+; SDAG-NEXT:    andb $1, %al
 ; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    retq
 ;
@@ -553,7 +553,7 @@ define zeroext i1 @fcmp_oeq3(float %x) {
 ; SDAG-NEXT:    xorps %xmm1, %xmm1
 ; SDAG-NEXT:    cmpeqss %xmm0, %xmm1
 ; SDAG-NEXT:    movd %xmm1, %eax
-; SDAG-NEXT:    andl $1, %eax
+; SDAG-NEXT:    andb $1, %al
 ; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    retq
 ;
@@ -1165,7 +1165,7 @@ define zeroext i1 @fcmp_une3(float %x) {
 ; SDAG-NEXT:    xorps %xmm1, %xmm1
 ; SDAG-NEXT:    cmpneqss %xmm0, %xmm1
 ; SDAG-NEXT:    movd %xmm1, %eax
-; SDAG-NEXT:    andl $1, %eax
+; SDAG-NEXT:    andb $1, %al
 ; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
index 04fce7badb9514..76dd49eb72c956 100644
--- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
@@ -763,7 +763,7 @@ define i100 @test_signed_i100_f32(float %f) nounwind {
 ; X86-X87-NEXT:    movl %ebx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    andl $15, %edx
+; X86-X87-NEXT:    andb $15, %dl
 ; X86-X87-NEXT:    movb %dl, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $60, %esp
@@ -816,7 +816,7 @@ define i100 @test_signed_i100_f32(float %f) nounwind {
 ; X86-SSE-NEXT:    movl %edi, 8(%esi)
 ; X86-SSE-NEXT:    movl %edx, 4(%esi)
 ; X86-SSE-NEXT:    movl %ecx, (%esi)
-; X86-SSE-NEXT:    andl $15, %eax
+; X86-SSE-NEXT:    andb $15, %al
 ; X86-SSE-NEXT:    movb %al, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $44, %esp
@@ -1777,7 +1777,7 @@ define i100 @test_signed_i100_f64(double %f) nounwind {
 ; X86-X87-NEXT:    movl %ebx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    andl $15, %edx
+; X86-X87-NEXT:    andb $15, %dl
 ; X86-X87-NEXT:    movb %dl, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $60, %esp
@@ -1830,7 +1830,7 @@ define i100 @test_signed_i100_f64(double %f) nounwind {
 ; X86-SSE-NEXT:    movl %edi, 8(%esi)
 ; X86-SSE-NEXT:    movl %edx, 4(%esi)
 ; X86-SSE-NEXT:    movl %ecx, (%esi)
-; X86-SSE-NEXT:    andl $15, %eax
+; X86-SSE-NEXT:    andb $15, %al
 ; X86-SSE-NEXT:    movb %al, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $44, %esp
@@ -2957,7 +2957,7 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ; X86-X87-NEXT:    movl %ebx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    andl $15, %edx
+; X86-X87-NEXT:    andb $15, %dl
 ; X86-X87-NEXT:    movb %dl, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $60, %esp
@@ -3017,7 +3017,7 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    movl %edi, 8(%esi)
 ; X86-SSE-NEXT:    movl %edx, 4(%esi)
 ; X86-SSE-NEXT:    movl %ecx, (%esi)
-; X86-SSE-NEXT:    andl $15, %eax
+; X86-SSE-NEXT:    andb $15, %al
 ; X86-SSE-NEXT:    movb %al, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $44, %esp
@@ -4288,7 +4288,7 @@ define i100 @test_signed_i100_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    movl %ebx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    andl $15, %edx
+; X86-X87-NEXT:    andb $15, %dl
 ; X86-X87-NEXT:    movb %dl, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $60, %esp
@@ -4349,7 +4349,7 @@ define i100 @test_signed_i100_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    movl %edi, 8(%esi)
 ; X86-SSE-NEXT:    movl %edx, 4(%esi)
 ; X86-SSE-NEXT:    movl %ecx, (%esi)
-; X86-SSE-NEXT:    andl $15, %eax
+; X86-SSE-NEXT:    andb $15, %al
 ; X86-SSE-NEXT:    movb %al, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $44, %esp
diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
index fefc92c313511b..8e705121f6976d 100644
--- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
@@ -720,7 +720,7 @@ define i100 @test_unsigned_i100_f32(float %f) nounwind {
 ; X86-X87-NEXT:    movl %edx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %edi, (%ecx)
-; X86-X87-NEXT:    andl $15, %eax
+; X86-X87-NEXT:    andb $15, %al
 ; X86-X87-NEXT:    movb %al, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $44, %esp
@@ -768,7 +768,7 @@ define i100 @test_unsigned_i100_f32(float %f) nounwind {
 ; X86-SSE-NEXT:    movl %eax, 8(%esi)
 ; X86-SSE-NEXT:    movl %ecx, 4(%esi)
 ; X86-SSE-NEXT:    movl %edx, (%esi)
-; X86-SSE-NEXT:    andl $15, %ebx
+; X86-SSE-NEXT:    andb $15, %bl
 ; X86-SSE-NEXT:    movb %bl, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $32, %esp
@@ -1644,7 +1644,7 @@ define i100 @test_unsigned_i100_f64(double %f) nounwind {
 ; X86-X87-NEXT:    movl %edx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %edi, (%ecx)
-; X86-X87-NEXT:    andl $15, %eax
+; X86-X87-NEXT:    andb $15, %al
 ; X86-X87-NEXT:    movb %al, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $44, %esp
@@ -1692,7 +1692,7 @@ define i100 @test_unsigned_i100_f64(double %f) nounwind {
 ; X86-SSE-NEXT:    movl %eax, 8(%esi)
 ; X86-SSE-NEXT:    movl %ecx, 4(%esi)
 ; X86-SSE-NEXT:    movl %edx, (%esi)
-; X86-SSE-NEXT:    andl $15, %ebx
+; X86-SSE-NEXT:    andb $15, %bl
 ; X86-SSE-NEXT:    movb %bl, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $32, %esp
@@ -2723,7 +2723,7 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind {
 ; X86-X87-NEXT:    movl %edx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %edi, (%ecx)
-; X86-X87-NEXT:    andl $15, %eax
+; X86-X87-NEXT:    andb $15, %al
 ; X86-X87-NEXT:    movb %al, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $44, %esp
@@ -2778,7 +2778,7 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    movl %eax, 8(%esi)
 ; X86-SSE-NEXT:    movl %ecx, 4(%esi)
 ; X86-SSE-NEXT:    movl %edx, (%esi)
-; X86-SSE-NEXT:    andl $15, %ebx
+; X86-SSE-NEXT:    andb $15, %bl
 ; X86-SSE-NEXT:    movb %bl, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $32, %esp
@@ -3938,7 +3938,7 @@ define i100 @test_unsigned_i100_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    movl %edx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %edi, (%ecx)
-; X86-X87-NEXT:    andl $15, %eax
+; X86-X87-NEXT:    andb $15, %al
 ; X86-X87-NEXT:    movb %al, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $60, %esp
@@ -3992,7 +3992,7 @@ define i100 @test_unsigned_i100_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    movl %eax, 8(%esi)
 ; X86-SSE-NEXT:    movl %ecx, 4(%esi)
 ; X86-SSE-NEXT:    movl %edx, (%esi)
-; X86-SSE-NEXT:    andl $15, %ebx
+; X86-SSE-NEXT:    andb $15, %bl
 ; X86-SSE-NEXT:    movb %bl, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $48, %esp
diff --git a/llvm/test/CodeGen/X86/isel-and.ll b/llvm/test/CodeGen/X86/isel-and.ll
index 8db8060a0b9c71..86538ccb837925 100644
--- a/llvm/test/CodeGen/X86/isel-and.ll
+++ b/llvm/test/CodeGen/X86/isel-and.ll
@@ -30,7 +30,7 @@ define i1 @and_i1(i1 %a, i1 %b) {
 ; SDAG-X64-LABEL: and_i1:
 ; SDAG-X64:       # %bb.0:
 ; SDAG-X64-NEXT:    movl %edi, %eax
-; SDAG-X64-NEXT:    andl %esi, %eax
+; SDAG-X64-NEXT:    andb %sil, %al
 ; SDAG-X64-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-X64-NEXT:    retq
 ;
@@ -74,7 +74,7 @@ define i8 @and_i8(i8 %a, i8 %b) {
 ; SDAG-X64-LABEL: and_i8:
 ; SDAG-X64:       # %bb.0:
 ; SDAG-X64-NEXT:    movl %edi, %eax
-; SDAG-X64-NEXT:    andl %esi, %eax
+; SDAG-X64-NEXT:    andb %sil, %al
 ; SDAG-X64-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-X64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/isel-or.ll b/llvm/test/CodeGen/X86/isel-or.ll
index ad11e4e5bd3b15..ffa805014e8cbc 100644
--- a/llvm/test/CodeGen/X86/isel-or.ll
+++ b/llvm/test/CodeGen/X86/isel-or.ll
@@ -30,7 +30,7 @@ define i1 @or_i1(i1 %a, i1 %b) {
 ; SDAG-X64-LABEL: or_i1:
 ; SDAG-X64:       # %bb.0:
 ; SDAG-X64-NEXT:    movl %edi, %eax
-; SDAG-X64-NEXT:    orl %esi, %eax
+; SDAG-X64-NEXT:    orb %sil, %al
 ; SDAG-X64-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-X64-NEXT:    retq
 ;
@@ -75,7 +75,7 @@ define i8 @or_i8(i8 %a, i8 %b) {
 ; SDAG-X64-LABEL: or_i8:
 ; SDAG-X64:       # %bb.0:
 ; SDAG-X64-NEXT:    movl %edi, %eax
-; SDAG-X64-NEXT:    orl %esi, %eax
+; SDAG-X64-NEXT:    orb %sil, %al
 ; SDAG-X64-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-X64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/isel-xor.ll b/llvm/test/CodeGen/X86/isel-xor.ll
index 979993cb7bdcf4..5fd3817626edc6 100644
--- a/llvm/test/CodeGen/X86/isel-xor.ll
+++ b/llvm/test/CodeGen/X86/isel-xor.ll
@@ -30,7 +30,7 @@ define i1 @xor_i1(i1 %a, i1 %b) {
 ; SDAG-X64-LABEL: xor_i1:
 ; SDAG-X64:       # %bb.0:
 ; SDAG-X64-NEXT:    movl %edi, %eax
-; SDAG-X64-NEXT:    xorl %esi, %eax
+; SDAG-X64-NEXT:    xorb %sil, %al
 ; SDAG-X64-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-X64-NEXT:    retq
 ;
@@ -75,7 +75,7 @@ define i8 @xor_i8(i8 %a, i8 %b) {
 ; SDAG-X64-LABEL: xor_i8:
 ; SDAG-X64:       # %bb.0:
 ; SDAG-X64-NEXT:    movl %edi, %eax
-; SDAG-X64-NEXT:    xorl %esi, %eax
+; SDAG-X64-NEXT:    xorb %sil, %al
 ; SDAG-X64-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-X64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll
index 9eaa65442a727f..e807d806b165b5 100644
--- a/llvm/test/CodeGen/X86/jump_sign.ll
+++ b/llvm/test/CodeGen/X86/jump_sign.ll
@@ -388,11 +388,10 @@ define i32 @func_test1(i32 %p1) nounwind uwtable {
 ; CHECK-LABEL: func_test1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl b, %eax
-; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    setb %cl
 ; CHECK-NEXT:    movl a, %eax
-; CHECK-NEXT:    testl %eax, %ecx
+; CHECK-NEXT:    testb %al, %cl
 ; CHECK-NEXT:    je .LBB18_2
 ; CHECK-NEXT:  # %bb.1: # %if.then
 ; CHECK-NEXT:    decl %eax
diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll
index e183bbc15617d5..3220a95a6134ce 100644
--- a/llvm/test/CodeGen/X86/known-pow2.ll
+++ b/llvm/test/CodeGen/X86/known-pow2.ll
@@ -104,7 +104,7 @@ define i1 @pow2_srl(i32 %x, i32 %y) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shll %cl, %eax
 ; CHECK-NEXT:    shrl $20, %eax
-; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    andb $1, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %yy = and i32 %y, 7
diff --git a/llvm/test/CodeGen/X86/large-pic-string.ll b/llvm/test/CodeGen/X86/large-pic-string.ll
index 5e7cdbb93dc883..2a2c46e501175f 100644
--- a/llvm/test/CodeGen/X86/large-pic-string.ll
+++ b/llvm/test/CodeGen/X86/large-pic-string.ll
@@ -12,7 +12,7 @@ define void @pr38385() {
 ; CHECK-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L0$pb, %rcx
 ; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    movabsq $.L.str@GOTOFF, %rax
-; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    addb %al, %cl
 ; CHECK-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    retq
   %p = alloca i8, align 1
diff --git a/llvm/test/CodeGen/X86/logic-shift.ll b/llvm/test/CodeGen/X86/logic-shift.ll
index 96e63d1122ec92..7eac8534d06b20 100644
--- a/llvm/test/CodeGen/X86/logic-shift.ll
+++ b/llvm/test/CodeGen/X86/logic-shift.ll
@@ -6,7 +6,7 @@ define i8 @or_lshr_commute0(i8 %x0, i8 %x1, i8 %y, i8 %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    orl %esi, %edi
+; CHECK-NEXT:    orb %sil, %dil
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, %dil
 ; CHECK-NEXT:    orb %dil, %al
@@ -178,7 +178,7 @@ define i8 @or_shl_commute1(i8 %x0, i8 %x1, i8 %y, i8 %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    orl %esi, %edi
+; CHECK-NEXT:    orb %sil, %dil
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, %dil
 ; CHECK-NEXT:    orb %dil, %al
@@ -290,7 +290,7 @@ define i8 @xor_lshr_commute0(i8 %x0, i8 %x1, i8 %y, i8 %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    xorb %sil, %dil
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, %dil
 ; CHECK-NEXT:    xorb %dil, %al
@@ -462,7 +462,7 @@ define i8 @xor_shl_commute1(i8 %x0, i8 %x1, i8 %y, i8 %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    xorb %sil, %dil
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, %dil
 ; CHECK-NEXT:    xorb %dil, %al
@@ -574,7 +574,7 @@ define i8 @and_lshr_commute0(i8 %x0, i8 %x1, i8 %y, i8 %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    andl %esi, %edi
+; CHECK-NEXT:    andb %sil, %dil
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, %dil
 ; CHECK-NEXT:    andb %dil, %al
@@ -746,7 +746,7 @@ define i8 @and_shl_commute1(i8 %x0, i8 %x1, i8 %y, i8 %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    andl %esi, %edi
+; CHECK-NEXT:    andb %sil, %dil
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, %dil
 ; CHECK-NEXT:    andb %dil, %al
@@ -902,9 +902,8 @@ define i8 @or_fshl_commute3(i8 %x, i8 %y) {
 ; CHECK-LABEL: or_fshl_commute3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    orl %edi, %esi
 ; CHECK-NEXT:    shlb $5, %sil
-; CHECK-NEXT:    shrb $3, %al
+; CHECK-NEXT:    rolb $5, %al
 ; CHECK-NEXT:    orb %sil, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
@@ -979,10 +978,9 @@ define i16 @or_fshr_commute2(i16 %x, i16 %y) {
 define i8 @or_fshr_commute3(i8 %x, i8 %y) {
 ; CHECK-LABEL: or_fshr_commute3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    orl %edi, %esi
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    shrb $6, %sil
-; CHECK-NEXT:    leal (,%rdi,4), %eax
+; CHECK-NEXT:    rolb $2, %al
 ; CHECK-NEXT:    orb %sil, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/lzcnt.ll b/llvm/test/CodeGen/X86/lzcnt.ll
index b0004019734168..3cdb9eae354703 100644
--- a/llvm/test/CodeGen/X86/lzcnt.ll
+++ b/llvm/test/CodeGen/X86/lzcnt.ll
@@ -13,7 +13,7 @@ define i8 @t1(i8 %x) nounwind  {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    lzcntl %eax, %eax
-; X86-NEXT:    addl $-24, %eax
+; X86-NEXT:    addb $-24, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
@@ -21,7 +21,7 @@ define i8 @t1(i8 %x) nounwind  {
 ; X32:       # %bb.0:
 ; X32-NEXT:    movzbl %dil, %eax
 ; X32-NEXT:    lzcntl %eax, %eax
-; X32-NEXT:    addl $-24, %eax
+; X32-NEXT:    addb $-24, %al
 ; X32-NEXT:    # kill: def $al killed $al killed $eax
 ; X32-NEXT:    retq
 ;
@@ -29,7 +29,7 @@ define i8 @t1(i8 %x) nounwind  {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    lzcntl %eax, %eax
-; X64-NEXT:    addl $-24, %eax
+; X64-NEXT:    addb $-24, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 	%tmp = tail call i8 @llvm.ctlz.i8( i8 %x, i1 false )
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index f4a0207dafde7c..77ca673d5b82ae 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -18,7 +18,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB0_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -91,7 +91,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB0_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -223,7 +223,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB1_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -304,7 +304,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB1_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -378,7 +378,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB1_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -449,7 +449,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB1_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -612,7 +612,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB2_1
@@ -688,7 +688,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB2_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -763,7 +763,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB2_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -836,7 +836,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB2_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -988,7 +988,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT:    movmskps %xmm3, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB3_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1027,7 +1027,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSE4-NEXT:    movmskps %xmm3, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB3_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1127,7 +1127,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB4_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1170,7 +1170,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packusdw %xmm3, %xmm0
 ; SSE4-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE4-NEXT:    movmskps %xmm2, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB4_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1210,7 +1210,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB4_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -1252,7 +1252,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB4_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -1358,7 +1358,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packuswb %xmm3, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %ecx
-; SSE2-NEXT:    xorl $15, %ecx
+; SSE2-NEXT:    xorb $15, %cl
 ; SSE2-NEXT:    testb $1, %cl
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    jne .LBB5_1
@@ -1401,7 +1401,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSE4-NEXT:    movmskps %xmm3, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB5_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1441,7 +1441,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB5_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -1483,7 +1483,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB5_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -1586,7 +1586,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    movmskpd %xmm1, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB6_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1609,7 +1609,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm2
 ; SSE4-NEXT:    movmskpd %xmm2, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB6_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1690,7 +1690,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    movmskpd %xmm1, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB7_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1715,7 +1715,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm2
 ; SSE4-NEXT:    movmskpd %xmm2, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB7_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1738,7 +1738,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
+; AVX-NEXT:    xorb $3, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB7_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -1813,7 +1813,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    movmskpd %xmm1, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB8_1
@@ -1836,7 +1836,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm2
 ; SSE4-NEXT:    movmskpd %xmm2, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB8_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1858,7 +1858,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
+; AVX-NEXT:    xorb $3, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB8_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -3281,7 +3281,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB11_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -3358,7 +3358,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm3, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB11_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -3427,7 +3427,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB11_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -3494,7 +3494,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB11_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -3658,7 +3658,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB12_1
@@ -3730,7 +3730,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm3, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB12_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -3801,7 +3801,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB12_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -3871,7 +3871,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB12_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -4030,7 +4030,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB13_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -4070,7 +4070,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT:    movmskps %xmm2, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB13_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4106,7 +4106,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskps %xmm1, %eax
-; AVX-NEXT:    xorl $15, %eax
+; AVX-NEXT:    xorb $15, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB13_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -4204,7 +4204,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %ecx
-; SSE2-NEXT:    xorl $15, %ecx
+; SSE2-NEXT:    xorb $15, %cl
 ; SSE2-NEXT:    testb $1, %cl
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    jne .LBB14_1
@@ -4244,7 +4244,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT:    movmskps %xmm2, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB14_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4280,7 +4280,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskps %xmm1, %eax
-; AVX-NEXT:    xorl $15, %eax
+; AVX-NEXT:    xorb $15, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB14_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -6210,7 +6210,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; SSE2-NEXT:    pcmpeqw %xmm1, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB17_1
@@ -6276,7 +6276,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; SSE4-NEXT:    pcmpeqw %xmm1, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB17_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -6341,7 +6341,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; AVX-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vpmovmskb %xmm1, %eax
-; AVX-NEXT:    notl %eax
+; AVX-NEXT:    notb %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB17_1
 ; AVX-NEXT:  # %bb.2: # %else
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index 487f7298f442c2..97431310604edb 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -114,7 +114,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je .LBB0_2
 ; SSE2-NEXT:  # %bb.1: # %cond.store
@@ -221,7 +221,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB0_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -495,7 +495,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB1_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -605,7 +605,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB1_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -694,7 +694,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB1_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -773,7 +773,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB1_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -1043,7 +1043,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB2_1
@@ -1147,7 +1147,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB2_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1237,7 +1237,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB2_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -1317,7 +1317,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB2_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -1531,7 +1531,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB3_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1587,7 +1587,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm4
 ; SSE4-NEXT:    movmskps %xmm4, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB3_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1763,7 +1763,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm0, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB4_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1821,7 +1821,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm1, %xmm1
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm4
 ; SSE4-NEXT:    movmskps %xmm4, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB4_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1869,7 +1869,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB4_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -1915,7 +1915,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB4_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -2078,7 +2078,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %ecx
-; SSE2-NEXT:    xorl $15, %ecx
+; SSE2-NEXT:    xorb $15, %cl
 ; SSE2-NEXT:    testb $1, %cl
 ; SSE2-NEXT:    movd %xmm4, %eax
 ; SSE2-NEXT:    jne .LBB5_1
@@ -2137,7 +2137,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packsswb %xmm1, %xmm1
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm4
 ; SSE4-NEXT:    movmskps %xmm4, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB5_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -2186,7 +2186,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB5_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -2233,7 +2233,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB5_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -2368,7 +2368,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    movmskpd %xmm2, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB6_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -2400,7 +2400,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT:    movmskpd %xmm3, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB6_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -2529,7 +2529,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    movmskpd %xmm2, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB7_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -2563,7 +2563,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT:    movmskpd %xmm3, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB7_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -2592,7 +2592,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
+; AVX-NEXT:    xorb $3, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB7_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -2698,7 +2698,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    movmskpd %xmm0, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm3, %ecx
 ; SSE2-NEXT:    jne .LBB8_1
@@ -2730,7 +2730,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT:    movmskpd %xmm3, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB8_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -2758,7 +2758,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
+; AVX-NEXT:    xorb $3, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB8_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -4174,7 +4174,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB11_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -4249,7 +4249,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm3, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB11_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4317,7 +4317,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB11_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -4384,7 +4384,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB11_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -4551,7 +4551,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB12_1
@@ -4620,7 +4620,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm3, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB12_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4689,7 +4689,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB12_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -4757,7 +4757,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB12_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -4922,7 +4922,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm0, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB13_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -4962,7 +4962,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm0, %xmm0
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT:    movmskps %xmm2, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB13_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4998,7 +4998,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskps %xmm1, %eax
-; AVX-NEXT:    xorl $15, %eax
+; AVX-NEXT:    xorb $15, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB13_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -5101,7 +5101,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packsswb %xmm0, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %ecx
-; SSE2-NEXT:    xorl $15, %ecx
+; SSE2-NEXT:    xorb $15, %cl
 ; SSE2-NEXT:    testb $1, %cl
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    jne .LBB14_1
@@ -5142,7 +5142,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packsswb %xmm0, %xmm0
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT:    movmskps %xmm2, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB14_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -5179,7 +5179,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskps %xmm1, %eax
-; AVX-NEXT:    xorl $15, %eax
+; AVX-NEXT:    xorb $15, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB14_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -7105,7 +7105,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; SSE2-NEXT:    pcmpeqw %xmm1, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB17_1
@@ -7171,7 +7171,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; SSE4-NEXT:    pcmpeqw %xmm1, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB17_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -7236,7 +7236,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; AVX-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vpmovmskb %xmm1, %eax
-; AVX-NEXT:    notl %eax
+; AVX-NEXT:    notb %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB17_1
 ; AVX-NEXT:  # %bb.2: # %else
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 498f250f11c690..b2a73a25ffc0a2 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -56,7 +56,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je .LBB0_2
 ; SSE2-NEXT:  # %bb.1: # %cond.store
@@ -156,7 +156,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB0_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -368,7 +368,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB1_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -471,7 +471,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB1_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -560,7 +560,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB1_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -638,7 +638,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB1_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -846,7 +846,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm1, %ecx
 ; SSE2-NEXT:    jne .LBB2_1
@@ -943,7 +943,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB2_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1033,7 +1033,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB2_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -1112,7 +1112,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB2_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -1292,7 +1292,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT:    movmskps %xmm3, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB3_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1345,7 +1345,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm8[0,2]
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm6
 ; SSE4-NEXT:    movmskps %xmm6, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB3_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1490,7 +1490,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm1
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB4_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1545,7 +1545,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packusdw %xmm5, %xmm5
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm6
 ; SSE4-NEXT:    movmskps %xmm6, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB4_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1595,7 +1595,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB4_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -1641,7 +1641,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB4_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -1771,7 +1771,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packuswb %xmm1, %xmm1
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT:    movmskps %xmm3, %ecx
-; SSE2-NEXT:    xorl $15, %ecx
+; SSE2-NEXT:    xorb $15, %cl
 ; SSE2-NEXT:    testb $1, %cl
 ; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    jne .LBB5_1
@@ -1827,7 +1827,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packuswb %xmm5, %xmm5
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm6
 ; SSE4-NEXT:    movmskps %xmm6, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB5_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1878,7 +1878,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB5_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -1925,7 +1925,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB5_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -2042,7 +2042,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    movmskpd %xmm1, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB6_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -2070,7 +2070,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT:    movmskpd %xmm3, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB6_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -2173,7 +2173,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    movmskpd %xmm1, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB7_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -2203,7 +2203,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT:    movmskpd %xmm3, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB7_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -2229,7 +2229,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
+; AVX-NEXT:    xorb $3, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB7_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -2316,7 +2316,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    movmskpd %xmm0, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm3, %ecx
 ; SSE2-NEXT:    jne .LBB8_1
@@ -2344,7 +2344,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT:    movmskpd %xmm3, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB8_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -2369,7 +2369,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
+; AVX-NEXT:    xorb $3, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB8_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -3868,7 +3868,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB11_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -3946,7 +3946,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm3, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB11_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4017,7 +4017,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB11_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -4086,7 +4086,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB11_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -4265,7 +4265,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm4, %ecx
 ; SSE2-NEXT:    jne .LBB12_1
@@ -4337,7 +4337,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm3, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB12_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4409,7 +4409,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB12_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -4479,7 +4479,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB12_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -4648,7 +4648,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB13_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -4689,7 +4689,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packusdw %xmm0, %xmm0
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT:    movmskps %xmm2, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB13_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4726,7 +4726,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB13_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -4764,7 +4764,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB13_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -4873,7 +4873,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packuswb %xmm3, %xmm3
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %ecx
-; SSE2-NEXT:    xorl $15, %ecx
+; SSE2-NEXT:    xorb $15, %cl
 ; SSE2-NEXT:    testb $1, %cl
 ; SSE2-NEXT:    movd %xmm3, %eax
 ; SSE2-NEXT:    jne .LBB14_1
@@ -4915,7 +4915,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packuswb %xmm0, %xmm0
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT:    movmskps %xmm2, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB14_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4953,7 +4953,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB14_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -4992,7 +4992,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB14_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -6953,7 +6953,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; SSE2-NEXT:    pcmpeqw %xmm1, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB17_1
@@ -7020,7 +7020,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; SSE4-NEXT:    pcmpeqw %xmm1, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB17_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -7086,7 +7086,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; AVX-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vpmovmskb %xmm1, %eax
-; AVX-NEXT:    notl %eax
+; AVX-NEXT:    notb %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB17_1
 ; AVX-NEXT:  # %bb.2: # %else
diff --git a/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll b/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll
index c997d314a50ae4..eed2b2fa01ad32 100644
--- a/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll
+++ b/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll
@@ -359,8 +359,8 @@ define void @test_store_sptr32_trunc_i1(ptr addrspace(270) %s, i32 %i) {
 ; CHECK-LABEL: test_store_sptr32_trunc_i1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    andl $1, %ecx
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    andb $1, %cl
 ; CHECK-NEXT:    movb %cl, (%eax)
 ; CHECK-NEXT:    retl
 ;
@@ -368,8 +368,8 @@ define void @test_store_sptr32_trunc_i1(ptr addrspace(270) %s, i32 %i) {
 ; CHECK-O0:       # %bb.0: # %entry
 ; CHECK-O0-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-O0-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-O0-NEXT:    andl $1, %ecx
 ; CHECK-O0-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-O0-NEXT:    andb $1, %cl
 ; CHECK-O0-NEXT:    movb %cl, (%eax)
 ; CHECK-O0-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll b/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll
index 67539b07f5716c..dcbfde8285e5d9 100644
--- a/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll
+++ b/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll
@@ -281,15 +281,15 @@ define void @test_store_sptr32_trunc_i1(ptr addrspace(270) %s, i32 %i) {
 ; CHECK-LABEL: test_store_sptr32_trunc_i1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movslq %ecx, %rax
-; CHECK-NEXT:    andl $1, %edx
+; CHECK-NEXT:    andb $1, %dl
 ; CHECK-NEXT:    movb %dl, (%rax)
 ; CHECK-NEXT:    retq
 ;
 ; CHECK-O0-LABEL: test_store_sptr32_trunc_i1:
 ; CHECK-O0:       # %bb.0: # %entry
 ; CHECK-O0-NEXT:    movslq %ecx, %rax
-; CHECK-O0-NEXT:    andl $1, %edx
 ; CHECK-O0-NEXT:    movb %dl, %cl
+; CHECK-O0-NEXT:    andb $1, %cl
 ; CHECK-O0-NEXT:    movb %cl, (%rax)
 ; CHECK-O0-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/or-with-overflow.ll b/llvm/test/CodeGen/X86/or-with-overflow.ll
index b3ffa209bc7004..c7b528a88ca0c2 100644
--- a/llvm/test/CodeGen/X86/or-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/or-with-overflow.ll
@@ -46,8 +46,8 @@ define i8 @or_i8_rr(i8 zeroext %0, i8 zeroext %1) {
 ;
 ; X64-LABEL: or_i8_rr:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    orl %edi, %eax
+; X64-NEXT:    orb %dil, %sil
+; X64-NEXT:    movzbl %sil, %eax
 ; X64-NEXT:    cmovel %edi, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/parity-vec.ll b/llvm/test/CodeGen/X86/parity-vec.ll
index f9a2411465141c..cb525a680b1a7f 100644
--- a/llvm/test/CodeGen/X86/parity-vec.ll
+++ b/llvm/test/CodeGen/X86/parity-vec.ll
@@ -16,7 +16,7 @@ define i1 @noncanonical_parity(<16 x i1> %x) {
 ; POPCNT-NEXT:    psllw $7, %xmm0
 ; POPCNT-NEXT:    pmovmskb %xmm0, %eax
 ; POPCNT-NEXT:    popcntl %eax, %eax
-; POPCNT-NEXT:    andl $1, %eax
+; POPCNT-NEXT:    andb $1, %al
 ; POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; POPCNT-NEXT:    retq
   %r = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %x)
@@ -36,7 +36,7 @@ define i1 @canonical_parity(<16 x i1> %x) {
 ; POPCNT-NEXT:    psllw $7, %xmm0
 ; POPCNT-NEXT:    pmovmskb %xmm0, %eax
 ; POPCNT-NEXT:    popcntl %eax, %eax
-; POPCNT-NEXT:    andl $1, %eax
+; POPCNT-NEXT:    andb $1, %al
 ; POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; POPCNT-NEXT:    retq
   %i1 = bitcast <16 x i1> %x to i16
@@ -65,7 +65,7 @@ define i1 @canonical_parity_noncanonical_pred(<16 x i1> %x) {
 ; NOPOPCNT-NEXT:    andl $3855, %ecx # imm = 0xF0F
 ; NOPOPCNT-NEXT:    movl %ecx, %eax
 ; NOPOPCNT-NEXT:    shrl $8, %eax
-; NOPOPCNT-NEXT:    addl %ecx, %eax
+; NOPOPCNT-NEXT:    addb %cl, %al
 ; NOPOPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; NOPOPCNT-NEXT:    retq
 ;
@@ -97,8 +97,8 @@ define i1 @noncanonical_nonparity(<16 x i1> %x) {
 ; POPCNT-NEXT:    psllw $7, %xmm0
 ; POPCNT-NEXT:    pmovmskb %xmm0, %eax
 ; POPCNT-NEXT:    popcntl %eax, %eax
-; POPCNT-NEXT:    andl $1, %eax
-; POPCNT-NEXT:    xorb $1, %al
+; POPCNT-NEXT:    notb %al
+; POPCNT-NEXT:    andb $1, %al
 ; POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; POPCNT-NEXT:    retq
   %r.inv = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %x)
@@ -142,8 +142,8 @@ define i1 @canonical_nonparity_noncanonical_pred(<16 x i1> %x) {
 ; POPCNT-NEXT:    psllw $7, %xmm0
 ; POPCNT-NEXT:    pmovmskb %xmm0, %eax
 ; POPCNT-NEXT:    popcntl %eax, %eax
-; POPCNT-NEXT:    andl $1, %eax
-; POPCNT-NEXT:    xorb $1, %al
+; POPCNT-NEXT:    notb %al
+; POPCNT-NEXT:    andb $1, %al
 ; POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; POPCNT-NEXT:    retq
   %i1 = bitcast <16 x i1> %x to i16
diff --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll
index 420f5ba5ab4336..2c3199db6ec65b 100644
--- a/llvm/test/CodeGen/X86/parity.ll
+++ b/llvm/test/CodeGen/X86/parity.ll
@@ -318,14 +318,14 @@ define i8 @parity_32_trunc(i32 %x) {
 ; X86-POPCNT-LABEL: parity_32_trunc:
 ; X86-POPCNT:       # %bb.0:
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %eax
-; X86-POPCNT-NEXT:    andl $1, %eax
+; X86-POPCNT-NEXT:    andb $1, %al
 ; X86-POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-POPCNT-NEXT:    retl
 ;
 ; X64-POPCNT-LABEL: parity_32_trunc:
 ; X64-POPCNT:       # %bb.0:
 ; X64-POPCNT-NEXT:    popcntl %edi, %eax
-; X64-POPCNT-NEXT:    andl $1, %eax
+; X64-POPCNT-NEXT:    andb $1, %al
 ; X64-POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-POPCNT-NEXT:    retq
   %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -381,9 +381,8 @@ define i16 @parity_16_mask255(i16 %x) {
 define i16 @parity_16_mask15(i16 %x) {
 ; X86-LABEL: parity_16_mask15:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testb $15, %cl
+; X86-NEXT:    testb $15, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setnp %al
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/pr14088.ll b/llvm/test/CodeGen/X86/pr14088.ll
index 83bf13280f94ae..32a5155c41b907 100644
--- a/llvm/test/CodeGen/X86/pr14088.ll
+++ b/llvm/test/CodeGen/X86/pr14088.ll
@@ -29,10 +29,10 @@ define i32 @f(i1 %foo, ptr %tm_year2, ptr %bar, i16 %zed, i32 %zed2) {
 ; CHECK-NEXT:    movswq %ax, %rax
 ; CHECK-NEXT:    imulq $1717986919, %rax, %rax # imm = 0x66666667
 ; CHECK-NEXT:    movq %rax, %rcx
-; CHECK-NEXT:    shrq $63, %rcx
-; CHECK-NEXT:    shrq $34, %rax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    movb %al, (%rdx)
+; CHECK-NEXT:    shrq $34, %rcx
+; CHECK-NEXT:    shrq $63, %rax
+; CHECK-NEXT:    addb %al, %cl
+; CHECK-NEXT:    movb %cl, (%rdx)
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:  .LBB0_2: # %return
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/pr15267.ll b/llvm/test/CodeGen/X86/pr15267.ll
index 5083eac71dce0c..89ef4cb2201e4b 100644
--- a/llvm/test/CodeGen/X86/pr15267.ll
+++ b/llvm/test/CodeGen/X86/pr15267.ll
@@ -84,62 +84,61 @@ define <16 x i4> @test4(ptr %in) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq (%rdi), %rax
 ; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    shrl $4, %ecx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    shrb $4, %cl
 ; CHECK-NEXT:    movl %eax, %edx
-; CHECK-NEXT:    andl $15, %edx
+; CHECK-NEXT:    andb $15, %dl
 ; CHECK-NEXT:    vmovd %edx, %xmm0
 ; CHECK-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $8, %ecx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $12, %ecx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $16, %ecx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $20, %ecx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $24, %ecx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $28, %ecx
 ; CHECK-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    shrq $32, %rcx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    shrq $36, %rcx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    shrq $40, %rcx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    shrq $44, %rcx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    shrq $48, %rcx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    shrq $52, %rcx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    shrq $56, %rcx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    shrq $60, %rax
 ; CHECK-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr35761.ll b/llvm/test/CodeGen/X86/pr35761.ll
index 5661b6775ab9d9..804366d93203b8 100644
--- a/llvm/test/CodeGen/X86/pr35761.ll
+++ b/llvm/test/CodeGen/X86/pr35761.ll
@@ -8,10 +8,10 @@
 define dso_local void @PR35761(i32 %call) {
 ; CHECK-LABEL: PR35761:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movzbl x(%rip), %eax
-; CHECK-NEXT:    andl $1, %eax
-; CHECK-NEXT:    movzbl y(%rip), %ecx
-; CHECK-NEXT:    xorl $255, %ecx
+; CHECK-NEXT:    movzbl y(%rip), %eax
+; CHECK-NEXT:    xorl $255, %eax
+; CHECK-NEXT:    movzbl x(%rip), %ecx
+; CHECK-NEXT:    andl $1, %ecx
 ; CHECK-NEXT:    orl %eax, %ecx
 ; CHECK-NEXT:    movw %cx, z(%rip)
 ; CHECK-NEXT:    movb $0, z+2(%rip)
diff --git a/llvm/test/CodeGen/X86/pr40539.ll b/llvm/test/CodeGen/X86/pr40539.ll
index 56d80a025fa084..a1df2d8f69238e 100644
--- a/llvm/test/CodeGen/X86/pr40539.ll
+++ b/llvm/test/CodeGen/X86/pr40539.ll
@@ -18,7 +18,7 @@ define zeroext i1 @_Z9test_log2v() {
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-NEXT:    cmpeqss (%esp), %xmm0
 ; CHECK-NEXT:    movd %xmm0, %eax
-; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    andb $1, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    popl %ecx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 4
diff --git a/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll b/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll
index 65e679063ccb09..6edd9a36b6185e 100644
--- a/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll
+++ b/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll
@@ -15,7 +15,7 @@ define i32 @f32_bzhi(i32 %x, i32 %y) local_unnamed_addr {
 ;
 ; CHECK32-LABEL: f32_bzhi:
 ; CHECK32:       # %bb.0: # %entry
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    retl
 entry:
@@ -34,7 +34,7 @@ define i32 @f32_bzhi_partial(i32 %x, i32 %y) local_unnamed_addr {
 ;
 ; CHECK32-LABEL: f32_bzhi_partial:
 ; CHECK32:       # %bb.0: # %entry
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index e7727a0ab6178c..f8a0698208ebda 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -597,7 +597,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    subq $1, %rbp
 ; X64-NEXT:    sbbq $0, %r14
 ; X64-NEXT:    shrq $63, %rbx
-; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    xorb %r15b, %bl
 ; X64-NEXT:    movq %r12, %rdi
 ; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
@@ -648,7 +648,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    subq $1, %rbp
 ; X64-NEXT:    sbbq $0, %r14
 ; X64-NEXT:    shrq $63, %rbx
-; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    xorb %r15b, %bl
 ; X64-NEXT:    movq %r12, %rdi
 ; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
@@ -710,7 +710,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    subq $1, %rbp
 ; X64-NEXT:    sbbq $0, %r14
 ; X64-NEXT:    shrq $63, %rbx
-; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    xorb %r15b, %bl
 ; X64-NEXT:    movq %r12, %rdi
 ; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
@@ -760,7 +760,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    subq $1, %rbp
 ; X64-NEXT:    sbbq $0, %r14
 ; X64-NEXT:    shrq $63, %rbx
-; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    xorb %r15b, %bl
 ; X64-NEXT:    movq %r12, %rdi
 ; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
diff --git a/llvm/test/CodeGen/X86/setcc-logic.ll b/llvm/test/CodeGen/X86/setcc-logic.ll
index c98aae7fbf4059..3d9105922e3165 100644
--- a/llvm/test/CodeGen/X86/setcc-logic.ll
+++ b/llvm/test/CodeGen/X86/setcc-logic.ll
@@ -454,9 +454,9 @@ define zeroext i1 @ne_neg1_and_ne_zero(i64 %x) nounwind {
 define zeroext i1 @and_eq(i8 %a, i8 %b, i8 %c, i8 %d) nounwind {
 ; CHECK-LABEL: and_eq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %esi, %edi
-; CHECK-NEXT:    xorl %ecx, %edx
-; CHECK-NEXT:    orb %dl, %dil
+; CHECK-NEXT:    xorb %sil, %dil
+; CHECK-NEXT:    xorb %cl, %dl
+; CHECK-NEXT:    orb %dil, %dl
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
   %cmp1 = icmp eq i8 %a, %b
@@ -468,9 +468,9 @@ define zeroext i1 @and_eq(i8 %a, i8 %b, i8 %c, i8 %d) nounwind {
 define zeroext i1 @or_ne(i8 %a, i8 %b, i8 %c, i8 %d) nounwind {
 ; CHECK-LABEL: or_ne:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %esi, %edi
-; CHECK-NEXT:    xorl %ecx, %edx
-; CHECK-NEXT:    orb %dl, %dil
+; CHECK-NEXT:    xorb %sil, %dil
+; CHECK-NEXT:    xorb %cl, %dl
+; CHECK-NEXT:    orb %dil, %dl
 ; CHECK-NEXT:    setne %al
 ; CHECK-NEXT:    retq
   %cmp1 = icmp ne i8 %a, %b
diff --git a/llvm/test/CodeGen/X86/setoeq.ll b/llvm/test/CodeGen/X86/setoeq.ll
index f0addf4b64599d..d30e9395af0458 100644
--- a/llvm/test/CodeGen/X86/setoeq.ll
+++ b/llvm/test/CodeGen/X86/setoeq.ll
@@ -9,7 +9,7 @@ define zeroext i8 @t(double %x) nounwind readnone {
 ; CHECK-NEXT:    cvtdq2pd %xmm1, %xmm1
 ; CHECK-NEXT:    cmpeqsd %xmm0, %xmm1
 ; CHECK-NEXT:    movd %xmm1, %eax
-; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    andb $1, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retl
 entry:
@@ -28,7 +28,7 @@ define zeroext i8 @u(double %x) nounwind readnone {
 ; CHECK-NEXT:    cvtdq2pd %xmm1, %xmm1
 ; CHECK-NEXT:    cmpneqsd %xmm0, %xmm1
 ; CHECK-NEXT:    movd %xmm1, %eax
-; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    andb $1, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 220c2e5012ea49..6fe45b37f030e8 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2476,14 +2476,14 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
 ; CHECK-AVX512VL-NEXT:    vpackuswb %ymm3, %ymm2, %ymm2
 ; CHECK-AVX512VL-NEXT:    vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
 ; CHECK-AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; CHECK-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK-AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT:    vpcmpgtb %ymm3, %ymm0, %ymm0
+; CHECK-AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm1, %ymm1
 ; CHECK-AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-AVX512VL-NEXT:    vpandn %ymm3, %ymm2, %ymm2
-; CHECK-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-AVX512VL-NEXT:    vpcmpgtb %ymm4, %ymm0, %ymm0
-; CHECK-AVX512VL-NEXT:    vpandn %ymm0, %ymm3, %ymm3
-; CHECK-AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm0
-; CHECK-AVX512VL-NEXT:    vpternlogq $14, %ymm3, %ymm2, %ymm0
+; CHECK-AVX512VL-NEXT:    vpandn %ymm0, %ymm3, %ymm0
+; CHECK-AVX512VL-NEXT:    vpternlogq $50, %ymm2, %ymm1, %ymm0
 ; CHECK-AVX512VL-NEXT:    retq
   %rem = srem <32 x i8> %x, <i8 13, i8 5, i8 19, i8 34, i8 2, i8 8, i8 2, i8 88, i8 62, i8 62, i8 5, i8 7, i8 97, i8 2, i8 3, i8 60, i8 3, i8 87, i8 7, i8 6, i8 84, i8 -128, i8 127, i8 56, i8 114, i8 1, i8 50, i8 7, i8 2, i8 8, i8 97, i8 117>
   %cmp = icmp ne <32 x i8> %rem, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll b/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll
index f7e053d384c99d..f624122be8374e 100644
--- a/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll
+++ b/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll
@@ -27,7 +27,7 @@ define i1 @test_spill_slot_size(i1 %a1, i2 %a2, i7 %a7, i8 %a8, i9 %a9, i15 %a15
 ; CHECK-NEXT:    andb $3, %sil
 ; CHECK-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl %ebx, %eax
-; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    andb $1, %al
 ; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    andb $127, %dl
 ; CHECK-NEXT:    movb %dl, {{[0-9]+}}(%rsp)
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll
index 9a8719f9a64fa5..cecc3efd86282f 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll
@@ -114,19 +114,19 @@ define i64 @out64_constmask(i64 %x, i64 %y) {
 define i8 @in8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: in8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %esi, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andb $60, %dil
-; CHECK-NOBMI-NEXT:    xorb %dil, %al
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
+; CHECK-NOBMI-NEXT:    andb $60, %al
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    movl %esi, %eax
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andb $60, %dil
-; CHECK-BMI-NEXT:    xorb %dil, %al
+; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorb %sil, %al
+; CHECK-BMI-NEXT:    andb $60, %al
+; CHECK-BMI-NEXT:    xorb %sil, %al
 ; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i8 %x, %y
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
index 2ea74f39423872..074b6f8591d1e1 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
@@ -114,19 +114,19 @@ define i64 @out64_constmask(i64 %x, i64 %y) {
 define i8 @in8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: in8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %esi, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andb $15, %dil
-; CHECK-NOBMI-NEXT:    xorb %dil, %al
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
+; CHECK-NOBMI-NEXT:    andb $15, %al
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    movl %esi, %eax
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andb $15, %dil
-; CHECK-BMI-NEXT:    xorb %dil, %al
+; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorb %sil, %al
+; CHECK-BMI-NEXT:    andb $15, %al
+; CHECK-BMI-NEXT:    xorb %sil, %al
 ; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i8 %x, %y
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
index eb6accd3e623b0..a9cf67789288c2 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
@@ -104,19 +104,19 @@ define i64 @out64_constmask(i64 %x, i64 %y) {
 define i8 @in8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: in8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %esi, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andb $15, %dil
-; CHECK-NOBMI-NEXT:    xorb %dil, %al
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
+; CHECK-NOBMI-NEXT:    andb $15, %al
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    movl %esi, %eax
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andb $15, %dil
-; CHECK-BMI-NEXT:    xorb %dil, %al
+; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorb %sil, %al
+; CHECK-BMI-NEXT:    andb $15, %al
+; CHECK-BMI-NEXT:    xorb %sil, %al
 ; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i8 %x, %y
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index b1194bedc4e1ca..27fa5e68f13052 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -16,11 +16,10 @@
 define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    andl %edx, %edi
-; CHECK-NEXT:    notb %al
-; CHECK-NEXT:    andb %sil, %al
-; CHECK-NEXT:    orb %dil, %al
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    xorb %sil, %al
+; CHECK-NEXT:    andb %dl, %al
+; CHECK-NEXT:    xorb %sil, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i8> %x, %mask
@@ -37,32 +36,28 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i8:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    movl %r8d, %eax
-; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andl %r8d, %edi
-; CHECK-BASELINE-NEXT:    notb %al
-; CHECK-BASELINE-NEXT:    notb %r9b
-; CHECK-BASELINE-NEXT:    andb %cl, %r9b
-; CHECK-BASELINE-NEXT:    andb %dl, %al
-; CHECK-BASELINE-NEXT:    orb %dil, %al
-; CHECK-BASELINE-NEXT:    orb %sil, %r9b
+; CHECK-BASELINE-NEXT:    movl %edi, %eax
+; CHECK-BASELINE-NEXT:    xorb %dl, %al
+; CHECK-BASELINE-NEXT:    andb %r8b, %al
+; CHECK-BASELINE-NEXT:    xorb %dl, %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %sil
+; CHECK-BASELINE-NEXT:    andb %r9b, %sil
+; CHECK-BASELINE-NEXT:    xorb %cl, %sil
 ; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-BASELINE-NEXT:    movl %r9d, %edx
+; CHECK-BASELINE-NEXT:    movl %esi, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i8:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movl %r8d, %eax
-; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    andl %r8d, %edi
-; CHECK-SSE1-NEXT:    notb %al
-; CHECK-SSE1-NEXT:    notb %r9b
-; CHECK-SSE1-NEXT:    andb %cl, %r9b
-; CHECK-SSE1-NEXT:    andb %dl, %al
-; CHECK-SSE1-NEXT:    orb %dil, %al
-; CHECK-SSE1-NEXT:    orb %sil, %r9b
+; CHECK-SSE1-NEXT:    movl %edi, %eax
+; CHECK-SSE1-NEXT:    xorb %dl, %al
+; CHECK-SSE1-NEXT:    andb %r8b, %al
+; CHECK-SSE1-NEXT:    xorb %dl, %al
+; CHECK-SSE1-NEXT:    xorb %cl, %sil
+; CHECK-SSE1-NEXT:    andb %r9b, %sil
+; CHECK-SSE1-NEXT:    xorb %cl, %sil
 ; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-SSE1-NEXT:    movl %r9d, %edx
+; CHECK-SSE1-NEXT:    movl %esi, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v2i8:
@@ -111,7 +106,7 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
 ; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
 ; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
@@ -135,7 +130,7 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    xorb %r9b, %sil
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
 ; CHECK-SSE1-NEXT:    xorb %r9b, %sil
 ; CHECK-SSE1-NEXT:    xorb %r11b, %dl
@@ -178,7 +173,7 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
 ; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
 ; CHECK-BASELINE-NEXT:    xorb %r10b, %dl
@@ -199,7 +194,7 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    xorb %r9b, %sil
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
 ; CHECK-SSE1-NEXT:    xorb %r9b, %sil
 ; CHECK-SSE1-NEXT:    xorb %r10b, %dl
@@ -2277,9 +2272,9 @@ define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v1i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
-; CHECK-NEXT:    andl %edx, %eax
-; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    xorb %sil, %al
+; CHECK-NEXT:    andb %dl, %al
+; CHECK-NEXT:    xorb %sil, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %n0 = xor <1 x i8> %x, %y
@@ -2296,12 +2291,12 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: in_v2i8:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movl %edi, %eax
-; CHECK-BASELINE-NEXT:    xorl %edx, %eax
-; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
-; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andl %r8d, %eax
-; CHECK-BASELINE-NEXT:    xorl %edx, %eax
-; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    xorb %dl, %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %sil
+; CHECK-BASELINE-NEXT:    andb %r9b, %sil
+; CHECK-BASELINE-NEXT:    andb %r8b, %al
+; CHECK-BASELINE-NEXT:    xorb %dl, %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %sil
 ; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BASELINE-NEXT:    movl %esi, %edx
 ; CHECK-BASELINE-NEXT:    retq
@@ -2309,12 +2304,12 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-SSE1-LABEL: in_v2i8:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movl %edi, %eax
-; CHECK-SSE1-NEXT:    xorl %edx, %eax
-; CHECK-SSE1-NEXT:    xorl %ecx, %esi
-; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    andl %r8d, %eax
-; CHECK-SSE1-NEXT:    xorl %edx, %eax
-; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    xorb %dl, %al
+; CHECK-SSE1-NEXT:    xorb %cl, %sil
+; CHECK-SSE1-NEXT:    andb %r9b, %sil
+; CHECK-SSE1-NEXT:    andb %r8b, %al
+; CHECK-SSE1-NEXT:    xorb %dl, %al
+; CHECK-SSE1-NEXT:    xorb %cl, %sil
 ; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-SSE1-NEXT:    movl %esi, %edx
 ; CHECK-SSE1-NEXT:    retq
@@ -2362,7 +2357,7 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
 ; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
 ; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
 ; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
@@ -2386,7 +2381,7 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    xorb %r9b, %sil
 ; CHECK-SSE1-NEXT:    xorb %r11b, %dl
 ; CHECK-SSE1-NEXT:    xorb %r10b, %cl
 ; CHECK-SSE1-NEXT:    xorb %dil, %r8b
diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
index 30202701fdb8c8..f70f586d0ecdb1 100644
--- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
@@ -1355,10 +1355,8 @@ define i1 @bool_reduction_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSE2-NEXT:    packssdw %xmm1, %xmm0
-; SSE2-NEXT:    packsswb %xmm0, %xmm0
 ; SSE2-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-NEXT:    notl %eax
-; SSE2-NEXT:    cmpb $-1, %al
+; SSE2-NEXT:    testl %eax, %eax
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
index 2df39d69dbb751..8f441fee670841 100644
--- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
@@ -1195,7 +1195,7 @@ define i1 @bool_reduction_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; SSE2-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-NEXT:    packsswb %xmm0, %xmm0
 ; SSE2-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-NEXT:    xorb $-1, %al
+; SSE2-NEXT:    cmpb $-1, %al
 ; SSE2-NEXT:    setne %al
 ; SSE2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index 9cd0f4d12e15ab..4d0d0548d44f90 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -1068,10 +1068,10 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE2-NEXT:    movd %xmm0, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
+; SSE2-NEXT:    orb %al, %cl
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    orl %ecx, %eax
+; SSE2-NEXT:    orb %cl, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je .LBB27_2
 ; SSE2-NEXT:  # %bb.1:
@@ -1088,9 +1088,9 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; SSE41-NEXT:    pxor %xmm0, %xmm1
 ; SSE41-NEXT:    pextrd $1, %xmm1, %eax
 ; SSE41-NEXT:    movd %xmm1, %ecx
-; SSE41-NEXT:    orl %eax, %ecx
+; SSE41-NEXT:    orb %al, %cl
 ; SSE41-NEXT:    pextrd $2, %xmm1, %eax
-; SSE41-NEXT:    orl %ecx, %eax
+; SSE41-NEXT:    orb %cl, %al
 ; SSE41-NEXT:    testb $1, %al
 ; SSE41-NEXT:    je .LBB27_2
 ; SSE41-NEXT:  # %bb.1:
@@ -1107,9 +1107,9 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; AVX1OR2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1OR2-NEXT:    vpextrd $1, %xmm0, %eax
 ; AVX1OR2-NEXT:    vmovd %xmm0, %ecx
-; AVX1OR2-NEXT:    orl %eax, %ecx
+; AVX1OR2-NEXT:    orb %al, %cl
 ; AVX1OR2-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX1OR2-NEXT:    orl %ecx, %eax
+; AVX1OR2-NEXT:    orb %cl, %al
 ; AVX1OR2-NEXT:    testb $1, %al
 ; AVX1OR2-NEXT:    je .LBB27_2
 ; AVX1OR2-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/X86/xor-with-overflow.ll b/llvm/test/CodeGen/X86/xor-with-overflow.ll
index 5d22302d39add9..e025989156e9e3 100644
--- a/llvm/test/CodeGen/X86/xor-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/xor-with-overflow.ll
@@ -46,8 +46,8 @@ define i8 @xor_i8_rr(i8 zeroext %0, i8 zeroext %1) {
 ;
 ; X64-LABEL: xor_i8_rr:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    xorl %edi, %eax
+; X64-NEXT:    xorb %dil, %sil
+; X64-NEXT:    movzbl %sil, %eax
 ; X64-NEXT:    cmovel %edi, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
index 39e2d6f1acca45..9170d8f7b45de9 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -312,22 +312,18 @@ define void @foldedidx(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) nou
 ; X64-NEXT:    .p2align 4, 0x90
 ; X64-NEXT:  .LBB3_1: # %for.body
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movzbl -3(%rdi,%rax), %ecx
-; X64-NEXT:    movzbl -3(%rsi,%rax), %r8d
-; X64-NEXT:    addl %ecx, %r8d
-; X64-NEXT:    movb %r8b, -3(%rdx,%rax)
-; X64-NEXT:    movzbl -2(%rdi,%rax), %ecx
-; X64-NEXT:    movzbl -2(%rsi,%rax), %r8d
-; X64-NEXT:    addl %ecx, %r8d
-; X64-NEXT:    movb %r8b, -2(%rdx,%rax)
-; X64-NEXT:    movzbl -1(%rdi,%rax), %ecx
-; X64-NEXT:    movzbl -1(%rsi,%rax), %r8d
-; X64-NEXT:    addl %ecx, %r8d
-; X64-NEXT:    movb %r8b, -1(%rdx,%rax)
-; X64-NEXT:    movzbl (%rdi,%rax), %ecx
-; X64-NEXT:    movzbl (%rsi,%rax), %r8d
-; X64-NEXT:    addl %ecx, %r8d
-; X64-NEXT:    movb %r8b, (%rdx,%rax)
+; X64-NEXT:    movzbl -3(%rsi,%rax), %ecx
+; X64-NEXT:    addb -3(%rdi,%rax), %cl
+; X64-NEXT:    movb %cl, -3(%rdx,%rax)
+; X64-NEXT:    movzbl -2(%rsi,%rax), %ecx
+; X64-NEXT:    addb -2(%rdi,%rax), %cl
+; X64-NEXT:    movb %cl, -2(%rdx,%rax)
+; X64-NEXT:    movzbl -1(%rsi,%rax), %ecx
+; X64-NEXT:    addb -1(%rdi,%rax), %cl
+; X64-NEXT:    movb %cl, -1(%rdx,%rax)
+; X64-NEXT:    movzbl (%rsi,%rax), %ecx
+; X64-NEXT:    addb (%rdi,%rax), %cl
+; X64-NEXT:    movb %cl, (%rdx,%rax)
 ; X64-NEXT:    addq $4, %rax
 ; X64-NEXT:    cmpl $403, %eax # imm = 0x193
 ; X64-NEXT:    jne .LBB3_1
@@ -337,7 +333,6 @@ define void @foldedidx(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) nou
 ; X32-LABEL: foldedidx:
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    pushl %ebx
-; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
 ; X32-NEXT:    movl $3, %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -346,28 +341,23 @@ define void @foldedidx(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) nou
 ; X32-NEXT:    .p2align 4, 0x90
 ; X32-NEXT:  .LBB3_1: # %for.body
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movzbl -3(%esi,%eax), %edi
 ; X32-NEXT:    movzbl -3(%edx,%eax), %ebx
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    addb -3(%esi,%eax), %bl
 ; X32-NEXT:    movb %bl, -3(%ecx,%eax)
-; X32-NEXT:    movzbl -2(%esi,%eax), %edi
 ; X32-NEXT:    movzbl -2(%edx,%eax), %ebx
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    addb -2(%esi,%eax), %bl
 ; X32-NEXT:    movb %bl, -2(%ecx,%eax)
-; X32-NEXT:    movzbl -1(%esi,%eax), %edi
 ; X32-NEXT:    movzbl -1(%edx,%eax), %ebx
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    addb -1(%esi,%eax), %bl
 ; X32-NEXT:    movb %bl, -1(%ecx,%eax)
-; X32-NEXT:    movzbl (%esi,%eax), %edi
 ; X32-NEXT:    movzbl (%edx,%eax), %ebx
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    addb (%esi,%eax), %bl
 ; X32-NEXT:    movb %bl, (%ecx,%eax)
 ; X32-NEXT:    addl $4, %eax
 ; X32-NEXT:    cmpl $403, %eax # imm = 0x193
 ; X32-NEXT:    jne .LBB3_1
 ; X32-NEXT:  # %bb.2: # %for.end
 ; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx
 ; X32-NEXT:    retl
 entry:
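
The hunks above all share one shape: a 32-bit logical op whose only user is a
narrow value is redone at the narrow width (xorl/notl/andl on %eax becoming
xorb/notb/andb on %al, and likewise for the i1 mask tests). A minimal IR
sketch of the pattern these tests pin down, as a hypothetical reduced case
rather than anything taken from the patch itself:

; Hypothetical reduced test: the xor feeds only the truncate, so the
; combine is free to perform it at i8 width (an xorb on x86).
define i8 @trunc_xor(i32 %x, i32 %y) {
  %v = xor i32 %x, %y
  %t = trunc i32 %v to i8
  ret i8 %t
}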

>From c685bc52201f7b7d6c58b0a38d7d968c245d5f19 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 11 Sep 2024 20:05:19 -0700
Subject: [PATCH 35/47] Update even more tests

---
 llvm/test/CodeGen/X86/ctpop-mask.ll       |  16 +-
 llvm/test/CodeGen/X86/fpenv.ll            |  13 +-
 llvm/test/CodeGen/X86/funnel-shift.ll     |  26 +--
 llvm/test/CodeGen/X86/shift-amount-mod.ll | 172 ++++++++---------
 llvm/test/CodeGen/X86/shift-i128.ll       | 221 ++++++++++------------
 5 files changed, 217 insertions(+), 231 deletions(-)

diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll
index a43dba94d30c78..cbb0ae719e5aa3 100644
--- a/llvm/test/CodeGen/X86/ctpop-mask.ll
+++ b/llvm/test/CodeGen/X86/ctpop-mask.ll
@@ -148,10 +148,9 @@ define i16 @ctpop_shifted_mask3(i16 %x) nounwind readnone {
 ;
 ; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask3:
 ; X86-NO-POPCOUNT:       # %bb.0:
-; X86-NO-POPCOUNT-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NO-POPCOUNT-NEXT:    andl $14, %ecx
+; X86-NO-POPCOUNT-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NO-POPCOUNT-NEXT:    andb $14, %cl
 ; X86-NO-POPCOUNT-NEXT:    movl $59796, %eax # imm = 0xE994
-; X86-NO-POPCOUNT-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-POPCOUNT-NEXT:    shrl %cl, %eax
 ; X86-NO-POPCOUNT-NEXT:    andl $3, %eax
 ; X86-NO-POPCOUNT-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -160,7 +159,7 @@ define i16 @ctpop_shifted_mask3(i16 %x) nounwind readnone {
 ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask3:
 ; X64-NO-POPCOUNT:       # %bb.0:
 ; X64-NO-POPCOUNT-NEXT:    movl %edi, %ecx
-; X64-NO-POPCOUNT-NEXT:    andl $14, %ecx
+; X64-NO-POPCOUNT-NEXT:    andb $14, %cl
 ; X64-NO-POPCOUNT-NEXT:    movl $59796, %eax # imm = 0xE994
 ; X64-NO-POPCOUNT-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-POPCOUNT-NEXT:    shrl %cl, %eax
@@ -201,7 +200,7 @@ define i64 @ctpop_mask4(i64 %x) nounwind readnone {
 ;
 ; X64-NO-POPCOUNT-LABEL: ctpop_mask4:
 ; X64-NO-POPCOUNT:       # %bb.0:
-; X64-NO-POPCOUNT-NEXT:    andl $15, %edi
+; X64-NO-POPCOUNT-NEXT:    andb $15, %dil
 ; X64-NO-POPCOUNT-NEXT:    leal (,%rdi,4), %ecx
 ; X64-NO-POPCOUNT-NEXT:    movabsq $4841987667533046032, %rax # imm = 0x4332322132212110
 ; X64-NO-POPCOUNT-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -241,9 +240,10 @@ define i32 @ctpop_shifted_mask4(i32 %x) nounwind readnone {
 ;
 ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask4:
 ; X64-NO-POPCOUNT:       # %bb.0:
-; X64-NO-POPCOUNT-NEXT:    movl %edi, %ecx
-; X64-NO-POPCOUNT-NEXT:    shrl $7, %ecx
-; X64-NO-POPCOUNT-NEXT:    andl $60, %ecx
+; X64-NO-POPCOUNT-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NO-POPCOUNT-NEXT:    shrl $9, %edi
+; X64-NO-POPCOUNT-NEXT:    andb $15, %dil
+; X64-NO-POPCOUNT-NEXT:    leal (,%rdi,4), %ecx
 ; X64-NO-POPCOUNT-NEXT:    movabsq $4841987667533046032, %rax # imm = 0x4332322132212110
 ; X64-NO-POPCOUNT-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-POPCOUNT-NEXT:    shrq %cl, %rax
diff --git a/llvm/test/CodeGen/X86/fpenv.ll b/llvm/test/CodeGen/X86/fpenv.ll
index c79e19f07cda54..42a753cc4ee4c3 100644
--- a/llvm/test/CodeGen/X86/fpenv.ll
+++ b/llvm/test/CodeGen/X86/fpenv.ll
@@ -185,8 +185,9 @@ define void @func_05(i32 %x) nounwind {
 ; X86-NOSSE-LABEL: func_05:
 ; X86-NOSSE:       # %bb.0:
 ; X86-NOSSE-NEXT:    pushl %eax
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    leal 4(%eax,%eax), %ecx
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT:    addl %ecx, %ecx
+; X86-NOSSE-NEXT:    addb $4, %cl
 ; X86-NOSSE-NEXT:    movl $201, %eax
 ; X86-NOSSE-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOSSE-NEXT:    shll %cl, %eax
@@ -203,8 +204,9 @@ define void @func_05(i32 %x) nounwind {
 ; X86-SSE-LABEL: func_05:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pushl %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    leal 4(%eax,%eax), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    addl %ecx, %ecx
+; X86-SSE-NEXT:    addb $4, %cl
 ; X86-SSE-NEXT:    movl $201, %eax
 ; X86-SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-SSE-NEXT:    shll %cl, %eax
@@ -227,7 +229,8 @@ define void @func_05(i32 %x) nounwind {
 ; X64-LABEL: func_05:
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    leal 4(%rdi,%rdi), %ecx
+; X64-NEXT:    leal (%rdi,%rdi), %ecx
+; X64-NEXT:    addb $4, %cl
 ; X64-NEXT:    movl $201, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shll %cl, %eax
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index a464d78f9af38c..c9e9bda1dcd3b4 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -182,8 +182,9 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind {
 ; X64-AVX-NEXT:    movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
 ; X64-AVX-NEXT:    mulq %rdx
 ; X64-AVX-NEXT:    leal (%rdx,%rdx,8), %eax
-; X64-AVX-NEXT:    leal (%rdx,%rax,4), %eax
-; X64-AVX-NEXT:    subl %eax, %ecx
+; X64-AVX-NEXT:    shll $2, %eax
+; X64-AVX-NEXT:    addb %dl, %al
+; X64-AVX-NEXT:    subb %al, %cl
 ; X64-AVX-NEXT:    shlq $27, %rsi
 ; X64-AVX-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-AVX-NEXT:    shldq %cl, %rsi, %rdi
@@ -349,9 +350,10 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind {
 ; X64-AVX-NEXT:    movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
 ; X64-AVX-NEXT:    mulq %rdx
 ; X64-AVX-NEXT:    leal (%rdx,%rdx,8), %eax
-; X64-AVX-NEXT:    leal (%rdx,%rax,4), %eax
-; X64-AVX-NEXT:    subl %eax, %ecx
-; X64-AVX-NEXT:    addl $27, %ecx
+; X64-AVX-NEXT:    shll $2, %eax
+; X64-AVX-NEXT:    addb %dl, %al
+; X64-AVX-NEXT:    subb %al, %cl
+; X64-AVX-NEXT:    addb $27, %cl
 ; X64-AVX-NEXT:    shlq $27, %rsi
 ; X64-AVX-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-AVX-NEXT:    shrdq %cl, %rdi, %rsi
@@ -437,16 +439,15 @@ define i32 @fshl_i32_undef0_msk(i32 %a0, i32 %a1) nounwind {
 ; X86-SSE2-LABEL: fshl_i32_undef0_msk:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    andl $7, %ecx
-; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    andb $7, %cl
 ; X86-SSE2-NEXT:    shldl %cl, %eax, %eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X64-AVX-LABEL: fshl_i32_undef0_msk:
 ; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    movl %esi, %ecx
-; X64-AVX-NEXT:    andl $7, %ecx
+; X64-AVX-NEXT:    andb $7, %cl
 ; X64-AVX-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-AVX-NEXT:    shldl %cl, %edi, %eax
 ; X64-AVX-NEXT:    retq
@@ -694,16 +695,15 @@ define i32 @fshr_i32_undef1_msk(i32 %a0, i32 %a1) nounwind {
 ; X86-SSE2-LABEL: fshr_i32_undef1_msk:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    andl $7, %ecx
-; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    andb $7, %cl
 ; X86-SSE2-NEXT:    shrdl %cl, %eax, %eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X64-AVX-LABEL: fshr_i32_undef1_msk:
 ; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    movl %esi, %ecx
-; X64-AVX-NEXT:    andl $7, %ecx
+; X64-AVX-NEXT:    andb $7, %cl
 ; X64-AVX-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-AVX-NEXT:    shrdl %cl, %edi, %eax
 ; X64-AVX-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/shift-amount-mod.ll b/llvm/test/CodeGen/X86/shift-amount-mod.ll
index 9f7ac748c47e16..c212e028b6c90e 100644
--- a/llvm/test/CodeGen/X86/shift-amount-mod.ll
+++ b/llvm/test/CodeGen/X86/shift-amount-mod.ll
@@ -735,10 +735,9 @@ define i32 @reg32_lshr_by_sub_from_negated(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_sub_from_negated:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    negb %cl
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -763,8 +762,8 @@ define i64 @reg64_lshr_by_sub_from_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movl %esi, %edx
@@ -782,7 +781,7 @@ define i64 @reg64_lshr_by_sub_from_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X64-LABEL: reg64_lshr_by_sub_from_negated:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    leal (%rdx,%rsi), %ecx
+; X64-NEXT:    leal (%rsi,%rdx), %ecx
 ; X64-NEXT:    negb %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrq %cl, %rax
@@ -800,9 +799,8 @@ define i32 @reg32_lshr_by_sub_of_negated(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_sub_of_negated:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -826,8 +824,8 @@ define i64 @reg64_lshr_by_sub_of_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    addb $-64, %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
@@ -844,7 +842,7 @@ define i64 @reg64_lshr_by_sub_of_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X64-LABEL: reg64_lshr_by_sub_of_negated:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    leal (%rdx,%rsi), %ecx
+; X64-NEXT:    leal (%rsi,%rdx), %ecx
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
@@ -862,9 +860,8 @@ define i32 @reg32_lshr_by_add_to_negated(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_add_to_negated:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -872,7 +869,7 @@ define i32 @reg32_lshr_by_add_to_negated(i32 %val, i32 %a, i32 %b) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %eax
 ; X64-NEXT:    retq
@@ -887,8 +884,8 @@ define i64 @reg64_lshr_by_add_to_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    addb $64, %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
@@ -906,7 +903,7 @@ define i64 @reg64_lshr_by_add_to_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
@@ -923,9 +920,8 @@ define i32 @reg32_lshr_by_sub_of_negated_amts(i32 %val, i32 %a, i32 %b) nounwind
 ; X86-LABEL: reg32_lshr_by_sub_of_negated_amts:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -933,7 +929,7 @@ define i32 @reg32_lshr_by_sub_of_negated_amts(i32 %val, i32 %a, i32 %b) nounwind
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %eax
 ; X64-NEXT:    retq
@@ -949,8 +945,8 @@ define i64 @reg64_lshr_by_sub_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -967,7 +963,7 @@ define i64 @reg64_lshr_by_sub_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
@@ -985,10 +981,9 @@ define i32 @reg32_lshr_by_add_of_negated_amts(i32 %val, i32 %a, i32 %b) nounwind
 ; X86-LABEL: reg32_lshr_by_add_of_negated_amts:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    negb %cl
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -1014,8 +1009,8 @@ define i64 @reg64_lshr_by_add_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    movb $-128, %cl
 ; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movl %esi, %edx
@@ -1033,7 +1028,7 @@ define i64 @reg64_lshr_by_add_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind
 ; X64-LABEL: reg64_lshr_by_add_of_negated_amts:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    leal (%rdx,%rsi), %ecx
+; X64-NEXT:    leal (%rsi,%rdx), %ecx
 ; X64-NEXT:    negb %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrq %cl, %rax
@@ -1109,10 +1104,9 @@ define i32 @reg32_lshr_by_negated_unfolded_sub_b(i32 %val, i32 %a, i32 %b) nounw
 ; X86-LABEL: reg32_lshr_by_negated_unfolded_sub_b:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    negb %cl
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -1138,8 +1132,8 @@ define i64 @reg64_lshr_by_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b) nounw
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movl %esi, %edx
@@ -1157,7 +1151,7 @@ define i64 @reg64_lshr_by_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b) nounw
 ; X64-LABEL: reg64_lshr_by_negated_unfolded_sub_b:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    leal (%rdx,%rsi), %ecx
+; X64-NEXT:    leal (%rsi,%rdx), %ecx
 ; X64-NEXT:    negb %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrq %cl, %rax
@@ -1173,9 +1167,8 @@ define i32 @reg32_lshr_by_b_sub_negated_unfolded(i32 %val, i32 %a, i32 %b) nounw
 ; X86-LABEL: reg32_lshr_by_b_sub_negated_unfolded:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -1200,8 +1193,8 @@ define i64 @reg64_lshr_by_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b) nounw
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    addb $-64, %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
@@ -1218,7 +1211,7 @@ define i64 @reg64_lshr_by_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b) nounw
 ; X64-LABEL: reg64_lshr_by_b_sub_negated_unfolded:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    leal (%rdx,%rsi), %ecx
+; X64-NEXT:    leal (%rsi,%rdx), %ecx
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
@@ -1233,9 +1226,8 @@ define i32 @reg32_lshr_by_negated_unfolded_add_b(i32 %val, i32 %a, i32 %b) nounw
 ; X86-LABEL: reg32_lshr_by_negated_unfolded_add_b:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -1243,7 +1235,7 @@ define i32 @reg32_lshr_by_negated_unfolded_add_b(i32 %val, i32 %a, i32 %b) nounw
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %eax
 ; X64-NEXT:    retq
@@ -1259,8 +1251,8 @@ define i64 @reg64_lshr_by_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b) nounw
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    addb $64, %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
@@ -1278,7 +1270,7 @@ define i64 @reg64_lshr_by_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b) nounw
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
@@ -1355,9 +1347,9 @@ define i32 @reg32_lshr_by_masked_negated_unfolded_sub_b(i32 %val, i32 %a, i32 %b
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $31, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    andb $31, %cl
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -1366,9 +1358,9 @@ define i32 @reg32_lshr_by_masked_negated_unfolded_sub_b(i32 %val, i32 %a, i32 %b
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    andl $31, %ecx
-; X64-NEXT:    subl %edx, %ecx
+; X64-NEXT:    negb %cl
+; X64-NEXT:    andb $31, %cl
+; X64-NEXT:    subb %dl, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %eax
 ; X64-NEXT:    retq
@@ -1385,9 +1377,10 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    andb $63, %cl
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -1404,9 +1397,9 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rsi, %rcx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    andl $63, %ecx
-; X64-NEXT:    subl %edx, %ecx
+; X64-NEXT:    negb %cl
+; X64-NEXT:    andb $63, %cl
+; X64-NEXT:    subb %dl, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
@@ -1421,12 +1414,11 @@ define i32 @reg32_lshr_by_masked_b_sub_negated_unfolded(i32 %val, i32 %a, i32 %b
 ; X86-LABEL: reg32_lshr_by_masked_b_sub_negated_unfolded:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl $31, %edx
-; X86-NEXT:    subl %edx, %ecx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    andb $31, %dl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -1434,9 +1426,9 @@ define i32 @reg32_lshr_by_masked_b_sub_negated_unfolded(i32 %val, i32 %a, i32 %b
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl $31, %esi
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    negb %sil
+; X64-NEXT:    andb $31, %sil
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %eax
 ; X64-NEXT:    retq
@@ -1452,11 +1444,12 @@ define i64 @reg64_lshr_by_masked_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl $63, %edx
-; X86-NEXT:    subl %edx, %ecx
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT:    subb %ch, %dl
+; X86-NEXT:    andb $63, %dl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -1473,9 +1466,9 @@ define i64 @reg64_lshr_by_masked_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    negb %sil
+; X64-NEXT:    andb $63, %sil
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
@@ -1491,9 +1484,9 @@ define i32 @reg32_lshr_by_masked_negated_unfolded_add_b(i32 %val, i32 %a, i32 %b
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $31, %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    andb $31, %cl
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -1503,8 +1496,8 @@ define i32 @reg32_lshr_by_masked_negated_unfolded_add_b(i32 %val, i32 %a, i32 %b
 ; X64-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl $31, %esi
+; X64-NEXT:    negb %sil
+; X64-NEXT:    andb $31, %sil
 ; X64-NEXT:    leal (%rsi,%rdx), %ecx
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %eax
@@ -1522,9 +1515,10 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    andb $63, %cl
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -1540,9 +1534,9 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b
 ; X64-LABEL: reg64_lshr_by_masked_negated_unfolded_add_b:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    leal (%rdx,%rsi), %ecx
+; X64-NEXT:    negb %sil
+; X64-NEXT:    andb $63, %sil
+; X64-NEXT:    leal (%rsi,%rdx), %ecx
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 4fbe05cd1b2f2f..4c2cd7bd907c34 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -267,9 +267,9 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $100, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT:    subl $92, %esp
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -290,78 +290,74 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, %ecx
+; i686-NEXT:    movl %ebx, %ecx
 ; i686-NEXT:    andl $7, %ecx
 ; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl $3, %esi
-; i686-NEXT:    andl $15, %esi
-; i686-NEXT:    movl 40(%esp,%esi), %eax
+; i686-NEXT:    shrl $3, %ebx
+; i686-NEXT:    andl $15, %ebx
+; i686-NEXT:    movl 32(%esp,%ebx), %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    shrl %cl, %eax
-; i686-NEXT:    notl %ecx
-; i686-NEXT:    movl 44(%esp,%esi), %edx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    notb %cl
+; i686-NEXT:    movl 36(%esp,%ebx), %edx
 ; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    addl %edx, %edx
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; i686-NEXT:    shll %cl, %edx
 ; i686-NEXT:    orl %eax, %edx
 ; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 36(%esp,%esi), %eax
+; i686-NEXT:    movl 28(%esp,%ebx), %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, %edx
+; i686-NEXT:    movl %esi, %edx
 ; i686-NEXT:    andl $7, %edx
-; i686-NEXT:    shrl $3, %ebx
-; i686-NEXT:    andl $15, %ebx
-; i686-NEXT:    movl 72(%esp,%ebx), %ebp
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrl %cl, %ebp
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    notl %ecx
-; i686-NEXT:    movl 76(%esp,%ebx), %eax
+; i686-NEXT:    shrl $3, %esi
+; i686-NEXT:    andl $15, %esi
+; i686-NEXT:    movl 64(%esp,%esi), %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    leal (%eax,%eax), %edi
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    shrl %cl, %eax
+; i686-NEXT:    notb %cl
+; i686-NEXT:    movl 68(%esp,%esi), %ebp
+; i686-NEXT:    leal (%ebp,%ebp), %edi
 ; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    orl %ebp, %edi
-; i686-NEXT:    movl 48(%esp,%esi), %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl 68(%esp,%ebx), %ecx
+; i686-NEXT:    orl %eax, %edi
+; i686-NEXT:    movl 40(%esp,%ebx), %eax
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    movl 60(%esp,%esi), %ecx
 ; i686-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; i686-NEXT:    movl 80(%esp,%ebx), %esi
+; i686-NEXT:    movl 72(%esp,%esi), %esi
 ; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    shrdl %cl, %esi, %ebp
+; i686-NEXT:    movl %ebx, %ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %esi, %ebx
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT:    shrl %cl, %ebp
+; i686-NEXT:    shrdl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shrl %cl, %eax
 ; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; i686-NEXT:    shrdl %cl, %ebx, (%esp) # 4-byte Folded Spill
 ; i686-NEXT:    shrl %cl, %esi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; i686-NEXT:    movl %esi, 28(%ecx)
-; i686-NEXT:    movl %ebx, 24(%ecx)
-; i686-NEXT:    movl (%esp), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 16(%ecx)
-; i686-NEXT:    movl %ebp, 12(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 8(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, (%ecx)
+; i686-NEXT:    movl %ebp, 24(%ecx)
+; i686-NEXT:    movl (%esp), %edx # 4-byte Reload
+; i686-NEXT:    movl %edx, 16(%ecx)
+; i686-NEXT:    movl %eax, 12(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, 8(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, (%ecx)
 ; i686-NEXT:    movl %edi, 20(%ecx)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; i686-NEXT:    movl %eax, 4(%ecx)
-; i686-NEXT:    addl $100, %esp
+; i686-NEXT:    addl $92, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -406,27 +402,27 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    subl $92, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    sarl $31, %ebx
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    sarl $31, %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    sarl $31, %eax
@@ -434,74 +430,69 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, %ebx
-; i686-NEXT:    andl $7, %ebx
+; i686-NEXT:    movl %edi, %ecx
+; i686-NEXT:    andl $7, %ecx
+; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    shrl $3, %edi
 ; i686-NEXT:    andl $15, %edi
 ; i686-NEXT:    movl 32(%esp,%edi), %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ecx
 ; i686-NEXT:    shrl %cl, %eax
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    notl %ecx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    notb %cl
 ; i686-NEXT:    movl 36(%esp,%edi), %edx
 ; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    addl %edx, %edx
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; i686-NEXT:    shll %cl, %edx
 ; i686-NEXT:    orl %eax, %edx
 ; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebp, %eax
-; i686-NEXT:    movl %ebp, %edx
+; i686-NEXT:    movl %esi, %edx
 ; i686-NEXT:    andl $7, %edx
-; i686-NEXT:    shrl $3, %eax
-; i686-NEXT:    andl $15, %eax
-; i686-NEXT:    movl 64(%esp,%eax), %ebp
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrl %cl, %ebp
+; i686-NEXT:    shrl $3, %esi
+; i686-NEXT:    andl $15, %esi
+; i686-NEXT:    movl 64(%esp,%esi), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    notl %ecx
-; i686-NEXT:    movl 68(%esp,%eax), %esi
-; i686-NEXT:    leal (%esi,%esi), %eax
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shll %cl, %eax
-; i686-NEXT:    orl %ebp, %eax
-; i686-NEXT:    movl 28(%esp,%edi), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrl %cl, %eax
+; i686-NEXT:    notb %cl
+; i686-NEXT:    movl 68(%esp,%esi), %ebp
+; i686-NEXT:    leal (%ebp,%ebp), %ebx
+; i686-NEXT:    shll %cl, %ebx
+; i686-NEXT:    orl %eax, %ebx
+; i686-NEXT:    movl 28(%esp,%edi), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl 40(%esp,%edi), %edi
-; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, %ecx
 ; i686-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT:    movl 60(%esp,%ecx), %ebp
-; i686-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 72(%esp,%ecx), %ebp
+; i686-NEXT:    movl 60(%esp,%esi), %ecx
+; i686-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; i686-NEXT:    movl 72(%esp,%esi), %esi
 ; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrdl %cl, %ebp, %esi
-; i686-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    shrdl %cl, %esi, %ebp
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; i686-NEXT:    sarl %cl, %edi
 ; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %esi, %ebx
-; i686-NEXT:    sarl %cl, %ebp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl %ebp, 28(%ecx)
-; i686-NEXT:    movl (%esp), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 24(%ecx)
-; i686-NEXT:    movl %ebx, 16(%ecx)
-; i686-NEXT:    movl %edi, 12(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, 8(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT:    movl %edx, (%ecx)
-; i686-NEXT:    movl %eax, 20(%ecx)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, 4(%ecx)
+; i686-NEXT:    shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
+; i686-NEXT:    sarl %cl, %esi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl %esi, 28(%eax)
+; i686-NEXT:    movl %ebp, 24(%eax)
+; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 16(%eax)
+; i686-NEXT:    movl %edi, 12(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 8(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, (%eax)
+; i686-NEXT:    movl %ebx, 20(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 4(%eax)
 ; i686-NEXT:    addl $92, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
@@ -584,8 +575,8 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    movl 4(%eax), %esi
 ; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    shrl %esi
-; i686-NEXT:    notl %ecx
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    notb %cl
 ; i686-NEXT:    shrl %cl, %esi
 ; i686-NEXT:    orl %edx, %esi
 ; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -609,9 +600,7 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    movl 4(%esi), %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    shrl %eax
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    notl %ecx
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    notb %cl
 ; i686-NEXT:    shrl %cl, %eax
 ; i686-NEXT:    orl %edi, %eax
 ; i686-NEXT:    movl (%esi), %ecx

>From 39d5340fa78993531f6ff02f24fae04f6dcd4c98 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 11 Sep 2024 21:18:17 -0700
Subject: [PATCH 36/47] Narrowing vectors isn't profitable | Update an
 extraordinary number of tests

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  2 ++
 llvm/test/CodeGen/X86/avx512-mask-op.ll       | 34 +++++++++----------
 llvm/test/CodeGen/X86/avx512-select.ll        |  8 ++---
 .../test/CodeGen/X86/bitcast-and-setcc-256.ll |  4 +--
 llvm/test/CodeGen/X86/fold-select.ll          |  4 +--
 .../CodeGen/X86/srem-seteq-vec-nonsplat.ll    | 12 +++----
 6 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f011249d295040..0554d630305e40 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34447,6 +34447,8 @@ bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
 
 bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
   // i16 instructions are longer (0x66 prefix) and potentially slower.
+  if (SrcVT.isVector() || DestVT.isVector())
+    return false;
   return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
 }
 
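For reference, a minimal IR sketch of the scalar shape the hook still reports as profitable after this change (illustrative only, not from the test suite; i32 -> i16 and, now, any vector type are the rejected cases):

define i8 @scalar_narrow_ok(i32 %x, i32 %y) {
  %a = and i32 %x, %y
  %t = trunc i32 %a to i8      ; narrowing the AND to 8 bits stays allowed
  ret i8 %t
}
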
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 9e689341f7b88e..856c5d96df5df7 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1952,15 +1952,15 @@ declare void @f2(i32) #1
 define void @store_i16_i1(i16 %x, ptr%y) {
 ; CHECK-LABEL: store_i16_i1:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    andb $1, %dil
 ; CHECK-NEXT:    movb %dil, (%rsi)
 ; CHECK-NEXT:    retq
 ;
 ; X86-LABEL: store_i16_i1:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
 ; X86-NEXT:    movb %cl, (%eax)
 ; X86-NEXT:    retl
   %c = trunc i16 %x to i1
@@ -1971,7 +1971,7 @@ define void @store_i16_i1(i16 %x, ptr%y) {
 define void @store_i8_i1(i8 %x, ptr%y) {
 ; CHECK-LABEL: store_i8_i1:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    andb $1, %dil
 ; CHECK-NEXT:    movb %dil, (%rsi)
 ; CHECK-NEXT:    retq
 ;
@@ -3936,7 +3936,7 @@ define i8 @test_v8i1_add(i8 %x, i8 %y) {
 ; CHECK-LABEL: test_v8i1_add:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    xorb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
@@ -3956,7 +3956,7 @@ define i8 @test_v8i1_sub(i8 %x, i8 %y) {
 ; CHECK-LABEL: test_v8i1_sub:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    xorb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
@@ -3976,7 +3976,7 @@ define i8 @test_v8i1_mul(i8 %x, i8 %y) {
 ; CHECK-LABEL: test_v8i1_mul:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    andb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
@@ -5128,7 +5128,7 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) {
 ; CHECK-LABEL: test_v1i1_add:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    xorb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
@@ -5148,7 +5148,7 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) {
 ; CHECK-LABEL: test_v1i1_sub:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    xorb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
@@ -5168,7 +5168,7 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) {
 ; CHECK-LABEL: test_v1i1_mul:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    andb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
@@ -5188,15 +5188,14 @@ define <1 x i1> @uadd_sat_v1i1(<1 x i1> %x, <1 x i1> %y) nounwind {
 ; CHECK-LABEL: uadd_sat_v1i1:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    orl %esi, %eax
+; CHECK-NEXT:    orb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
 ; X86-LABEL: uadd_sat_v1i1:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    ## kill: def $al killed $al killed $eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    retl
   %z = call <1 x i1> @llvm.uadd.sat.v1i1(<1 x i1> %x, <1 x i1> %y)
   ret <1 x i1> %z
@@ -5257,15 +5256,14 @@ define <1 x i1> @sadd_sat_v1i1(<1 x i1> %x, <1 x i1> %y) nounwind {
 ; CHECK-LABEL: sadd_sat_v1i1:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    orl %esi, %eax
+; CHECK-NEXT:    orb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
 ; X86-LABEL: sadd_sat_v1i1:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    ## kill: def $al killed $al killed $eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    retl
   %z = call <1 x i1> @llvm.sadd.sat.v1i1(<1 x i1> %x, <1 x i1> %y)
   ret <1 x i1> %z
diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll
index 536c667c7ec902..2c75d68428cb80 100644
--- a/llvm/test/CodeGen/X86/avx512-select.ll
+++ b/llvm/test/CodeGen/X86/avx512-select.ll
@@ -137,8 +137,8 @@ define i8 @select05(i8 %a.0, i8 %m) {
 ;
 ; X64-LABEL: select05:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    orb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %mask = bitcast i8 %m to <8 x i1>
@@ -212,8 +212,8 @@ define i8 @select06(i8 %a.0, i8 %m) {
 ;
 ; X64-LABEL: select06:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %mask = bitcast i8 %m to <8 x i1>
diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
index 234c7a0a500d30..d81a81ee93c6ed 100644
--- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -873,7 +873,7 @@ define i8 @v4i32_concat_undef(<4 x i32> %vec) {
 ; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
 ; SSE2-SSSE3-NEXT:    movmskps %xmm1, %eax
-; SSE2-SSSE3-NEXT:    xorl $15, %eax
+; SSE2-SSSE3-NEXT:    xorb $15, %al
 ; SSE2-SSSE3-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-SSSE3-NEXT:    retq
 ;
@@ -882,7 +882,7 @@ define i8 @v4i32_concat_undef(<4 x i32> %vec) {
 ; AVX12-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX12-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX12-NEXT:    vmovmskps %xmm0, %eax
-; AVX12-NEXT:    xorl $15, %eax
+; AVX12-NEXT:    xorb $15, %al
 ; AVX12-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX12-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/fold-select.ll b/llvm/test/CodeGen/X86/fold-select.ll
index 31afe979a33b35..cc9a525ba1f39a 100644
--- a/llvm/test/CodeGen/X86/fold-select.ll
+++ b/llvm/test/CodeGen/X86/fold-select.ll
@@ -19,8 +19,8 @@ define <8 x float> @select_and_v8i1(<8 x i1> %a, <8 x i1> %b, <8 x i1> %c, <8 x
 define <8 x float> @select_and_v8i1_2(i8 %m1, i8 %m2, i8 %m3, <8 x float> %d) {
 ; CHECK-LABEL: select_and_v8i1_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    orl %esi, %edi
-; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    orb %sil, %dil
+; CHECK-NEXT:    andb %dl, %dil
 ; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 6fe45b37f030e8..220c2e5012ea49 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2476,14 +2476,14 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
 ; CHECK-AVX512VL-NEXT:    vpackuswb %ymm3, %ymm2, %ymm2
 ; CHECK-AVX512VL-NEXT:    vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
 ; CHECK-AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
-; CHECK-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX512VL-NEXT:    vpcmpgtb %ymm3, %ymm0, %ymm0
-; CHECK-AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm1, %ymm1
 ; CHECK-AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-AVX512VL-NEXT:    vpandn %ymm3, %ymm2, %ymm2
-; CHECK-AVX512VL-NEXT:    vpandn %ymm0, %ymm3, %ymm0
-; CHECK-AVX512VL-NEXT:    vpternlogq $50, %ymm2, %ymm1, %ymm0
+; CHECK-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK-AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-AVX512VL-NEXT:    vpcmpgtb %ymm4, %ymm0, %ymm0
+; CHECK-AVX512VL-NEXT:    vpandn %ymm0, %ymm3, %ymm3
+; CHECK-AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm0
+; CHECK-AVX512VL-NEXT:    vpternlogq $14, %ymm3, %ymm2, %ymm0
 ; CHECK-AVX512VL-NEXT:    retq
   %rem = srem <32 x i8> %x, <i8 13, i8 5, i8 19, i8 34, i8 2, i8 8, i8 2, i8 88, i8 62, i8 62, i8 5, i8 7, i8 97, i8 2, i8 3, i8 60, i8 3, i8 87, i8 7, i8 6, i8 84, i8 -128, i8 127, i8 56, i8 114, i8 1, i8 50, i8 7, i8 2, i8 8, i8 97, i8 117>
   %cmp = icmp ne <32 x i8> %rem, zeroinitializer

>From 9012365beff928222134f5dc54d606ee70e91295 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Fri, 13 Sep 2024 18:19:24 -0700
Subject: [PATCH 37/47] Changes with additional mov instructions

---
 .../X86/8bit_cmov_of_trunc_promotion.ll       | 178 +++++++++---------
 llvm/test/CodeGen/X86/movmsk-cmp.ll           |  26 +--
 ...unfold-masked-merge-scalar-variablemask.ll |  24 +--
 3 files changed, 117 insertions(+), 111 deletions(-)

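The extra movs come from cmov: x86 has no 8-bit form of the instruction, so operands narrowed to i8 must be zero-extended back to 32 bits (movzbl) around the cmovgl. A minimal IR sketch of the shape involved (illustrative; the real tests wrap this in a trunc of wide adds):

define i8 @select_i8_sketch(i8 %a, i8 %b) {
  %c = icmp sgt i8 %a, %b      ; cmpb
  %r = select i1 %c, i8 %a, i8 %b  ; movzbl + movzbl + cmovgl
  ret i8 %r
}
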
diff --git a/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll b/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
index 1a8d33f5b3480f..f4b54bf22d0f52 100644
--- a/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
+++ b/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
@@ -11,62 +11,63 @@
 define i8 @t0(i32 %a1_wide_orig, i32 %a2_wide_orig, i32 %inc) nounwind {
 ; I386-NOCMOV-LABEL: t0:
 ; I386-NOCMOV:       # %bb.0:
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-NOCMOV-NEXT:    addl %ecx, %eax
-; I386-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT:    addb %cl, %al
+; I386-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; I386-NOCMOV-NEXT:    cmpb %cl, %al
 ; I386-NOCMOV-NEXT:    jg .LBB0_2
 ; I386-NOCMOV-NEXT:  # %bb.1:
 ; I386-NOCMOV-NEXT:    movl %ecx, %eax
 ; I386-NOCMOV-NEXT:  .LBB0_2:
-; I386-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-NOCMOV-NEXT:    retl
 ;
 ; I386-CMOV-LABEL: t0:
 ; I386-CMOV:       # %bb.0:
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    addl %eax, %ecx
-; I386-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    addb %al, %cl
+; I386-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I386-CMOV-NEXT:    cmpb %al, %cl
+; I386-CMOV-NEXT:    movzbl %cl, %ecx
+; I386-CMOV-NEXT:    movzbl %al, %eax
 ; I386-CMOV-NEXT:    cmovgl %ecx, %eax
 ; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-CMOV-NEXT:    retl
 ;
 ; I686-NOCMOV-LABEL: t0:
 ; I686-NOCMOV:       # %bb.0:
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-NOCMOV-NEXT:    addl %ecx, %eax
-; I686-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I686-NOCMOV-NEXT:    addb %cl, %al
+; I686-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; I686-NOCMOV-NEXT:    cmpb %cl, %al
 ; I686-NOCMOV-NEXT:    jg .LBB0_2
 ; I686-NOCMOV-NEXT:  # %bb.1:
 ; I686-NOCMOV-NEXT:    movl %ecx, %eax
 ; I686-NOCMOV-NEXT:  .LBB0_2:
-; I686-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-NOCMOV-NEXT:    retl
 ;
 ; I686-CMOV-LABEL: t0:
 ; I686-CMOV:       # %bb.0:
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-CMOV-NEXT:    addl %eax, %ecx
-; I686-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-CMOV-NEXT:    addb %al, %cl
+; I686-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I686-CMOV-NEXT:    cmpb %al, %cl
+; I686-CMOV-NEXT:    movzbl %cl, %ecx
+; I686-CMOV-NEXT:    movzbl %al, %eax
 ; I686-CMOV-NEXT:    cmovgl %ecx, %eax
 ; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-CMOV-NEXT:    retl
 ;
 ; X86_64-LABEL: t0:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    # kill: def $edx killed $edx def $rdx
-; X86_64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X86_64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X86_64-NEXT:    leal (%rdi,%rdx), %ecx
-; X86_64-NEXT:    leal (%rsi,%rdx), %eax
-; X86_64-NEXT:    cmpb %al, %cl
+; X86_64-NEXT:    addb %dl, %dil
+; X86_64-NEXT:    addb %dl, %sil
+; X86_64-NEXT:    cmpb %sil, %dil
+; X86_64-NEXT:    movzbl %dil, %ecx
+; X86_64-NEXT:    movzbl %sil, %eax
 ; X86_64-NEXT:    cmovgl %ecx, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
@@ -84,64 +85,63 @@ define i8 @t0(i32 %a1_wide_orig, i32 %a2_wide_orig, i32 %inc) nounwind {
 define i8 @neg_only_one_truncation(i32 %a1_wide_orig, i8 %a2_orig, i32 %inc) nounwind {
 ; I386-NOCMOV-LABEL: neg_only_one_truncation:
 ; I386-NOCMOV:       # %bb.0:
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-NOCMOV-NEXT:    addl %ecx, %eax
-; I386-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    addb %al, %cl
+; I386-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I386-NOCMOV-NEXT:    cmpb %cl, %al
 ; I386-NOCMOV-NEXT:    jg .LBB1_2
 ; I386-NOCMOV-NEXT:  # %bb.1:
 ; I386-NOCMOV-NEXT:    movl %ecx, %eax
 ; I386-NOCMOV-NEXT:  .LBB1_2:
-; I386-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-NOCMOV-NEXT:    retl
 ;
 ; I386-CMOV-LABEL: neg_only_one_truncation:
 ; I386-CMOV:       # %bb.0:
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    addl %eax, %ecx
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    addb %al, %cl
 ; I386-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
-; I386-CMOV-NEXT:    cmpb %al, %cl
-; I386-CMOV-NEXT:    movzbl %al, %eax
-; I386-CMOV-NEXT:    cmovgl %ecx, %eax
+; I386-CMOV-NEXT:    cmpb %cl, %al
+; I386-CMOV-NEXT:    movzbl %al, %edx
+; I386-CMOV-NEXT:    movzbl %cl, %eax
+; I386-CMOV-NEXT:    cmovgl %edx, %eax
 ; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-CMOV-NEXT:    retl
 ;
 ; I686-NOCMOV-LABEL: neg_only_one_truncation:
 ; I686-NOCMOV:       # %bb.0:
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-NOCMOV-NEXT:    addl %ecx, %eax
-; I686-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    addb %al, %cl
+; I686-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I686-NOCMOV-NEXT:    cmpb %cl, %al
 ; I686-NOCMOV-NEXT:    jg .LBB1_2
 ; I686-NOCMOV-NEXT:  # %bb.1:
 ; I686-NOCMOV-NEXT:    movl %ecx, %eax
 ; I686-NOCMOV-NEXT:  .LBB1_2:
-; I686-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-NOCMOV-NEXT:    retl
 ;
 ; I686-CMOV-LABEL: neg_only_one_truncation:
 ; I686-CMOV:       # %bb.0:
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-CMOV-NEXT:    addl %eax, %ecx
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-CMOV-NEXT:    addb %al, %cl
 ; I686-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
-; I686-CMOV-NEXT:    cmpb %al, %cl
-; I686-CMOV-NEXT:    movzbl %al, %eax
-; I686-CMOV-NEXT:    cmovgl %ecx, %eax
+; I686-CMOV-NEXT:    cmpb %cl, %al
+; I686-CMOV-NEXT:    movzbl %al, %edx
+; I686-CMOV-NEXT:    movzbl %cl, %eax
+; I686-CMOV-NEXT:    cmovgl %edx, %eax
 ; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-CMOV-NEXT:    retl
 ;
 ; X86_64-LABEL: neg_only_one_truncation:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    # kill: def $edx killed $edx def $rdx
-; X86_64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X86_64-NEXT:    leal (%rdi,%rdx), %ecx
-; X86_64-NEXT:    addb %sil, %dl
-; X86_64-NEXT:    cmpb %dl, %cl
-; X86_64-NEXT:    movzbl %dl, %eax
+; X86_64-NEXT:    addb %dl, %sil
+; X86_64-NEXT:    addb %dl, %dil
+; X86_64-NEXT:    cmpb %sil, %dil
+; X86_64-NEXT:    movzbl %dil, %ecx
+; X86_64-NEXT:    movzbl %sil, %eax
 ; X86_64-NEXT:    cmovgl %ecx, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
@@ -159,62 +159,63 @@ define i8 @neg_only_one_truncation(i32 %a1_wide_orig, i8 %a2_orig, i32 %inc) nou
 define i8 @neg_type_mismatch(i32 %a1_wide_orig, i16 %a2_wide_orig, i32 %inc) nounwind {
 ; I386-NOCMOV-LABEL: neg_type_mismatch:
 ; I386-NOCMOV:       # %bb.0:
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-NOCMOV-NEXT:    addl %ecx, %eax
-; I386-NOCMOV-NEXT:    addw {{[0-9]+}}(%esp), %cx
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT:    addb %cl, %al
+; I386-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; I386-NOCMOV-NEXT:    cmpb %cl, %al
 ; I386-NOCMOV-NEXT:    jg .LBB2_2
 ; I386-NOCMOV-NEXT:  # %bb.1:
 ; I386-NOCMOV-NEXT:    movl %ecx, %eax
 ; I386-NOCMOV-NEXT:  .LBB2_2:
-; I386-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-NOCMOV-NEXT:    retl
 ;
 ; I386-CMOV-LABEL: neg_type_mismatch:
 ; I386-CMOV:       # %bb.0:
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    addl %eax, %ecx
-; I386-CMOV-NEXT:    addw {{[0-9]+}}(%esp), %ax
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    addb %al, %cl
+; I386-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I386-CMOV-NEXT:    cmpb %al, %cl
+; I386-CMOV-NEXT:    movzbl %cl, %ecx
+; I386-CMOV-NEXT:    movzbl %al, %eax
 ; I386-CMOV-NEXT:    cmovgl %ecx, %eax
 ; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-CMOV-NEXT:    retl
 ;
 ; I686-NOCMOV-LABEL: neg_type_mismatch:
 ; I686-NOCMOV:       # %bb.0:
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-NOCMOV-NEXT:    addl %ecx, %eax
-; I686-NOCMOV-NEXT:    addw {{[0-9]+}}(%esp), %cx
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I686-NOCMOV-NEXT:    addb %cl, %al
+; I686-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; I686-NOCMOV-NEXT:    cmpb %cl, %al
 ; I686-NOCMOV-NEXT:    jg .LBB2_2
 ; I686-NOCMOV-NEXT:  # %bb.1:
 ; I686-NOCMOV-NEXT:    movl %ecx, %eax
 ; I686-NOCMOV-NEXT:  .LBB2_2:
-; I686-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-NOCMOV-NEXT:    retl
 ;
 ; I686-CMOV-LABEL: neg_type_mismatch:
 ; I686-CMOV:       # %bb.0:
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-CMOV-NEXT:    addl %eax, %ecx
-; I686-CMOV-NEXT:    addw {{[0-9]+}}(%esp), %ax
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-CMOV-NEXT:    addb %al, %cl
+; I686-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I686-CMOV-NEXT:    cmpb %al, %cl
+; I686-CMOV-NEXT:    movzbl %cl, %ecx
+; I686-CMOV-NEXT:    movzbl %al, %eax
 ; I686-CMOV-NEXT:    cmovgl %ecx, %eax
 ; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-CMOV-NEXT:    retl
 ;
 ; X86_64-LABEL: neg_type_mismatch:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    # kill: def $edx killed $edx def $rdx
-; X86_64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X86_64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X86_64-NEXT:    leal (%rdi,%rdx), %ecx
-; X86_64-NEXT:    leal (%rsi,%rdx), %eax
-; X86_64-NEXT:    cmpb %al, %cl
+; X86_64-NEXT:    addb %dl, %dil
+; X86_64-NEXT:    addb %dl, %sil
+; X86_64-NEXT:    cmpb %sil, %dil
+; X86_64-NEXT:    movzbl %dil, %ecx
+; X86_64-NEXT:    movzbl %sil, %eax
 ; X86_64-NEXT:    cmovgl %ecx, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
@@ -234,8 +235,8 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
 ; I386-NOCMOV-LABEL: negative_CopyFromReg:
 ; I386-NOCMOV:       # %bb.0:
 ; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; I386-NOCMOV-NEXT:    cmpb %cl, %al
 ; I386-NOCMOV-NEXT:    jg .LBB3_2
 ; I386-NOCMOV-NEXT:  # %bb.1:
@@ -246,9 +247,10 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
 ; I386-CMOV-LABEL: negative_CopyFromReg:
 ; I386-CMOV:       # %bb.0:
 ; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I386-CMOV-NEXT:    cmpb %al, %cl
+; I386-CMOV-NEXT:    movzbl %al, %eax
 ; I386-CMOV-NEXT:    cmovgl %ecx, %eax
 ; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-CMOV-NEXT:    retl
@@ -256,8 +258,8 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
 ; I686-NOCMOV-LABEL: negative_CopyFromReg:
 ; I686-NOCMOV:       # %bb.0:
 ; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; I686-NOCMOV-NEXT:    cmpb %cl, %al
 ; I686-NOCMOV-NEXT:    jg .LBB3_2
 ; I686-NOCMOV-NEXT:  # %bb.1:
@@ -268,19 +270,19 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
 ; I686-CMOV-LABEL: negative_CopyFromReg:
 ; I686-CMOV:       # %bb.0:
 ; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I686-CMOV-NEXT:    cmpb %al, %cl
+; I686-CMOV-NEXT:    movzbl %al, %eax
 ; I686-CMOV-NEXT:    cmovgl %ecx, %eax
 ; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-CMOV-NEXT:    retl
 ;
 ; X86_64-LABEL: negative_CopyFromReg:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    # kill: def $edx killed $edx def $rdx
-; X86_64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X86_64-NEXT:    leal (%rsi,%rdx), %eax
-; X86_64-NEXT:    cmpb %al, %dil
+; X86_64-NEXT:    addb %dl, %sil
+; X86_64-NEXT:    cmpb %sil, %dil
+; X86_64-NEXT:    movzbl %sil, %eax
 ; X86_64-NEXT:    cmovgl %edi, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 253f990f8735ee..1d80529ee9a7bd 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -3655,11 +3655,11 @@ define i1 @movmsk_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; SSE-NEXT:    shrl $15, %ecx
 ; SSE-NEXT:    movl %eax, %edx
 ; SSE-NEXT:    shrl $8, %edx
-; SSE-NEXT:    andl $1, %edx
+; SSE-NEXT:    andb $1, %dl
 ; SSE-NEXT:    andl $8, %eax
 ; SSE-NEXT:    shrl $3, %eax
-; SSE-NEXT:    xorl %edx, %eax
-; SSE-NEXT:    andl %ecx, %eax
+; SSE-NEXT:    xorb %dl, %al
+; SSE-NEXT:    andb %cl, %al
 ; SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE-NEXT:    retq
 ;
@@ -3671,11 +3671,11 @@ define i1 @movmsk_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; AVX1OR2-NEXT:    shrl $15, %ecx
 ; AVX1OR2-NEXT:    movl %eax, %edx
 ; AVX1OR2-NEXT:    shrl $8, %edx
-; AVX1OR2-NEXT:    andl $1, %edx
+; AVX1OR2-NEXT:    andb $1, %dl
 ; AVX1OR2-NEXT:    andl $8, %eax
 ; AVX1OR2-NEXT:    shrl $3, %eax
-; AVX1OR2-NEXT:    xorl %edx, %eax
-; AVX1OR2-NEXT:    andl %ecx, %eax
+; AVX1OR2-NEXT:    xorb %dl, %al
+; AVX1OR2-NEXT:    andb %cl, %al
 ; AVX1OR2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1OR2-NEXT:    retq
 ;
@@ -3685,8 +3685,8 @@ define i1 @movmsk_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; KNL-NEXT:    vpextrb $15, %xmm0, %ecx
 ; KNL-NEXT:    vpextrb $8, %xmm0, %edx
 ; KNL-NEXT:    vpextrb $3, %xmm0, %eax
-; KNL-NEXT:    xorl %edx, %eax
-; KNL-NEXT:    andl %ecx, %eax
+; KNL-NEXT:    xorb %dl, %al
+; KNL-NEXT:    andb %cl, %al
 ; KNL-NEXT:    # kill: def $al killed $al killed $eax
 ; KNL-NEXT:    retq
 ;
@@ -4171,7 +4171,9 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm0, %xmm1
 ; SSE2-NEXT:    movmskpd %xmm1, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
+; Is this being performed by a different transformation that's using isNarrowingProfitable()?
+; SSE2-NEXT:    movzbl %al, %eax
 ; SSE2-NEXT:    btl %edi, %eax
 ; SSE2-NEXT:    setb %al
 ; SSE2-NEXT:    retq
@@ -4180,7 +4182,8 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
 ; SSE41-NEXT:    movmskpd %xmm0, %eax
-; SSE41-NEXT:    xorl $3, %eax
+; SSE41-NEXT:    xorb $3, %al
+; SSE41-NEXT:    movzbl %al, %eax
 ; SSE41-NEXT:    btl %edi, %eax
 ; SSE41-NEXT:    setb %al
 ; SSE41-NEXT:    retq
@@ -4189,7 +4192,8 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) {
 ; AVX1OR2:       # %bb.0:
 ; AVX1OR2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; AVX1OR2-NEXT:    vmovmskpd %xmm0, %eax
-; AVX1OR2-NEXT:    xorl $3, %eax
+; AVX1OR2-NEXT:    xorb $3, %al
+; AVX1OR2-NEXT:    movzbl %al, %eax
 ; AVX1OR2-NEXT:    btl %edi, %eax
 ; AVX1OR2-NEXT:    setb %al
 ; AVX1OR2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
index 9c9d06921096cb..6889fe1edff4b3 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
@@ -6,18 +6,17 @@
 define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-NOBMI-LABEL: out8:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edx, %eax
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    notb %al
-; CHECK-NOBMI-NEXT:    andb %sil, %al
-; CHECK-NOBMI-NEXT:    orb %dil, %al
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
+; CHECK-NOBMI-NEXT:    andb %dl, %al
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out8:
 ; CHECK-BMI:       # %bb.0:
 ; CHECK-BMI-NEXT:    movl %edx, %eax
-; CHECK-BMI-NEXT:    andl %edx, %edi
+; CHECK-BMI-NEXT:    andb %al, %dil
 ; CHECK-BMI-NEXT:    notb %al
 ; CHECK-BMI-NEXT:    andb %sil, %al
 ; CHECK-BMI-NEXT:    orb %dil, %al
@@ -106,17 +105,18 @@ define i8 @in8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-NOBMI-LABEL: in8:
 ; CHECK-NOBMI:       # %bb.0:
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %eax
-; CHECK-NOBMI-NEXT:    andl %edx, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
+; CHECK-NOBMI-NEXT:    andb %dl, %al
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andnl %esi, %edx, %eax
-; CHECK-BMI-NEXT:    andl %edx, %edi
-; CHECK-BMI-NEXT:    orl %edi, %eax
+; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorb %sil, %al
+; CHECK-BMI-NEXT:    andb %dl, %al
+; CHECK-BMI-NEXT:    xorb %sil, %al
 ; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i8 %x, %y

>From 3a1b38cdf67579becbd7034dc9c6fd1f7b9df302 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Fri, 13 Sep 2024 18:20:22 -0700
Subject: [PATCH 38/47] Changes with non-trivial diffs

---
 llvm/test/CodeGen/X86/ucmp.ll                 |  334 +-
 .../X86/wide-scalar-shift-legalization.ll     | 3114 ++++++++---------
 2 files changed, 1671 insertions(+), 1777 deletions(-)

diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index ac35605be4d52e..fb58f3064637b2 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -1484,26 +1484,26 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X64-NEXT:    setb %bl
 ; X64-NEXT:    cmpq %r14, %rcx
 ; X64-NEXT:    sbbq %rdx, %rax
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; X64-NEXT:    sbbb $0, %bl
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    cmpq %rcx, %r15
+; X64-NEXT:    cmpq %rcx, %r14
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %r14
-; X64-NEXT:    setb %r14b
-; X64-NEXT:    cmpq %r15, %rcx
+; X64-NEXT:    sbbq %rax, %r15
+; X64-NEXT:    setb %r15b
+; X64-NEXT:    cmpq %r14, %rcx
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
 ; X64-NEXT:    sbbq %rdx, %rax
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    sbbb $0, %r14b
+; X64-NEXT:    sbbb $0, %r15b
 ; X64-NEXT:    cmpq %r12, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    sbbq %rcx, %r15
-; X64-NEXT:    setb %r15b
+; X64-NEXT:    sbbq %rcx, %r14
+; X64-NEXT:    setb %r14b
 ; X64-NEXT:    cmpq %rax, %r12
 ; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X64-NEXT:    movd %eax, %xmm0
@@ -1535,7 +1535,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X64-NEXT:    movd %eax, %xmm13
 ; X64-NEXT:    movzbl %bl, %eax
 ; X64-NEXT:    movd %eax, %xmm14
-; X64-NEXT:    movzbl %r14b, %eax
+; X64-NEXT:    movzbl %r15b, %eax
 ; X64-NEXT:    movd %eax, %xmm15
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -1553,11 +1553,10 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X64-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm7[0]
 ; X64-NEXT:    sbbq %rdx, %rcx
-; X64-NEXT:    sbbb $0, %r15b
-; X64-NEXT:    movzbl %r15b, %eax
-; X64-NEXT:    andl $3, %eax
+; X64-NEXT:    sbbb $0, %r14b
+; X64-NEXT:    andb $3, %r14b
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movb %al, 4(%rdi)
+; X64-NEXT:    movb %r14b, 4(%rdi)
 ; X64-NEXT:    movdqa %xmm15, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    andl $3, %eax
@@ -1876,22 +1875,22 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %bl
 ; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    sbbl %ebx, %edi
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmpl %ebp, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    sbbl %ebp, %ebx
+; X86-NEXT:    sbbl %eax, %ebx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %cl
@@ -1917,208 +1916,213 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    cmpl %edx, %eax
+; X86-NEXT:    sbbl %edi, %esi
 ; X86-NEXT:    sbbl %ebp, %ebx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    sbbl %edi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %edx
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    setb %dl
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    sbbl %ebp, %ebx
+; X86-NEXT:    cmpl %esi, %eax
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    sbbl %ebp, %ecx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %dl
 ; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    cmpl %esi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    sbbl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    sbbl %ebp, %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    setb %al
+; X86-NEXT:    cmpl %edi, %esi
+; X86-NEXT:    sbbl %ebp, %ebx
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    cmpl %esi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    sbbl %ebx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    sbbl %ebp, %edx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %ebp
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    cmpl %edi, %esi
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %ebp
 ; X86-NEXT:    movl $0, %ebp
 ; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    sbbl %ebx, %edx
+; X86-NEXT:    setb %dl
+; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    sbbb $0, %dl
+; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    sbbl %edi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmpl %esi, %eax
 ; X86-NEXT:    sbbl %ebp, %edi
-; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    setb %bh
-; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    setb %al
+; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %esi, %ebp
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    sbbl %ecx, %ecx
-; X86-NEXT:    sbbb $0, %bh
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %eax, %ebp
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    setb %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movb %cl, 4(%edi)
-; X86-NEXT:    movzbl %bh, %ebp
-; X86-NEXT:    movzbl %bl, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %ebp
-; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    leal (%ecx,%ebp,4), %ecx
 ; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    shll $4, %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    andl $3, %ebx
-; X86-NEXT:    shll $6, %ebx
-; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    andl $3, %edx
+; X86-NEXT:    leal (%edx,%eax,4), %edx
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    shll $4, %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    andl $3, %ebp
+; X86-NEXT:    shll $6, %ebp
+; X86-NEXT:    orl %edi, %ebp
 ; X86-NEXT:    andl $3, %esi
 ; X86-NEXT:    shll $8, %esi
-; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl %ebp, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    andl $3, %edx
 ; X86-NEXT:    shll $10, %edx
 ; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    andl $3, %ecx
+; X86-NEXT:    shll $12, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    shll $12, %eax
+; X86-NEXT:    shll $14, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    shll $14, %ecx
+; X86-NEXT:    shll $16, %ecx
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    andl $3, %esi
 ; X86-NEXT:    shll $18, %esi
-; X86-NEXT:    orl %eax, %esi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X86-NEXT:    andl $3, %eax
 ; X86-NEXT:    shll $20, %eax
 ; X86-NEXT:    orl %esi, %eax
@@ -2127,22 +2131,24 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    andl $3, %ecx
 ; X86-NEXT:    shll $22, %ecx
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    shll $24, %edi
+; X86-NEXT:    orl %ecx, %edi
 ; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    shll $24, %esi
-; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    shll $26, %esi
+; X86-NEXT:    orl %edi, %esi
 ; X86-NEXT:    andl $3, %ebx
-; X86-NEXT:    shll $26, %ebx
+; X86-NEXT:    shll $28, %ebx
 ; X86-NEXT:    orl %esi, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    shll $28, %ecx
+; X86-NEXT:    shll $30, %ecx
 ; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    shll $30, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    movl %edx, (%edi)
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    andb $3, %dl
+; X86-NEXT:    movb %dl, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    addl $132, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index f84131dfc87970..636b9cd87e8de4 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -3375,7 +3375,6 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
@@ -3383,19 +3382,19 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
@@ -3406,84 +3405,80 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $7, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%r8), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%r8), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbp,%rbp), %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r10, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r14, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rdi), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r13,%r13), %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%r8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 48(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 32(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, 40(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
@@ -3530,13 +3525,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %sil
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rdi), %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, %r12
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
@@ -3551,10 +3545,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r13
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rdi), %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%rbp,%rbp), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%rbp,%rbp), %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r15, %r11
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r14
@@ -3568,9 +3562,9 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 32(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 40(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
@@ -3620,50 +3614,47 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r14, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r10, %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %r12d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r12b
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r9, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r14, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r10, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r14, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rax, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 56(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
@@ -3714,12 +3705,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %r10
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r11, %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %r12d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r12b
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r9,%r9), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %rdi, %rdi
@@ -3727,14 +3717,14 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%rbx,%rbx), %r10
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r10
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r13,%r13), %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %r15, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r13,%r13), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %r14, %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %r12
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r12, %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r13
@@ -3745,7 +3735,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 32(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 16(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, 40(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
@@ -3762,42 +3752,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $208, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $188, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3806,7 +3796,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -3834,196 +3824,183 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $63, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $7, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $63, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 140(%esp,%esi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $188, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -4116,12 +4093,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %dl
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%esi), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
@@ -4251,48 +4227,50 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $180, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4307,8 +4285,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -4325,147 +4303,134 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 52(%esp,%ecx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 76(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 52(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $180, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -4487,9 +4452,9 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
@@ -4524,18 +4489,18 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -4554,103 +4519,101 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ecx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %dl
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, (%esp) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 56(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 48(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 40(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 52(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -4680,7 +4643,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
@@ -4716,85 +4678,81 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    negl %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movslq %esi, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r14), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r14), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movslq %esi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rbx), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r14), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rbx), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r14), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rbx), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%r14), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%r14), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rbx), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -8(%rsp,%r14), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rbx), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r13, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%r14), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -8(%rsp,%rbx), %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rbx), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 48(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, 40(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r13, 56(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
@@ -4842,8 +4800,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %sil
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%r10), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r10), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rdi
@@ -4934,55 +4891,52 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    negl %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movslq %esi, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rcx), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rcx), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r15, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rcx), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r11, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bpl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movslq %esi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r14, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r8, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r13b
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r10, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %r15, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rcx), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rbx, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r8, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rcx), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r8, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r8, %r11, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r8, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r13, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r12, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 48(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 56(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
@@ -5038,8 +4992,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %r10
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r10, %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ebp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %bpl
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax), %r11
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
@@ -5086,41 +5039,41 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $192, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5137,7 +5090,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -5163,174 +5118,165 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $7, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, (%esp) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    negl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 176(%esp,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%edi), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    negl %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 176(%esp,%ecx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 56(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 48(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 52(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 48(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5452,8 +5398,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %dl
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
@@ -5585,45 +5530,45 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $216, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $192, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -5641,13 +5586,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl %ebp, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -5664,156 +5609,139 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, (%esp), %ebx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, 212(%esp,%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 36(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 28(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 48(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 32(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $216, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $192, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -5827,43 +5755,41 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%edx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%edx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%edi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%edi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%edx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%edx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5876,19 +5802,21 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl %edx, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -5905,125 +5833,121 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esi), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esi), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 188(%esp,%esi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esi), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %edi, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 188(%esp,%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 52(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 44(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, (%esp) # 4-byte Folded Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 52(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 44(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%esi)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 56(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 56(%esi)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%esi)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%esi)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%esi)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%esi)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%esi)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%esi)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
@@ -6039,7 +5963,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
@@ -6047,108 +5970,104 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $7, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%r8), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%r8), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbp,%rbp), %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r10, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r14, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rdi), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r13,%r13), %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%r8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 48(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 32(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, 40(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
@@ -6196,13 +6115,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %sil
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rdi), %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, %r12
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
@@ -6217,10 +6135,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r13
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rdi), %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%rbp,%rbp), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%rbp,%rbp), %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r15, %r11
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r14
@@ -6234,9 +6152,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 32(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 40(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
@@ -6287,50 +6205,47 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r14, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r10, %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %r12d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r12b
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r9, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r14, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r10, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r14, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rax, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 56(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
@@ -6382,12 +6297,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %r10
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r11, %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %r12d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r12b
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r9,%r9), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %rdi, %rdi
@@ -6395,14 +6309,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%rbx,%rbx), %r10
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r10
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r13,%r13), %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %r15, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r13,%r13), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r12, %r14, %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %r12
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %r12, %rbp
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r13
@@ -6413,7 +6327,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 32(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 16(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, 40(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
@@ -6430,7 +6344,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $188, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6443,7 +6357,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
@@ -6452,19 +6366,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6473,7 +6387,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6503,196 +6417,183 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $63, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $7, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $63, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 140(%esp,%esi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 56(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $188, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -6786,12 +6687,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    notb %dl
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%esi), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
@@ -6921,7 +6821,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $184, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6938,205 +6838,194 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 56(%esp,%ecx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 76(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %edx, %ebp, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 40(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $184, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -7175,118 +7064,117 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%eax), %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%eax), %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
@@ -7300,42 +7188,42 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%edx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%eax), %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 56(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 48(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 48(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 40(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 32(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 32(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 24(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 24(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 16(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 16(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %ebp, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 52(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $200, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi

>From 2970a356e745887c913a9ed238a2781ab7ab801a Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Fri, 13 Sep 2024 18:29:16 -0700
Subject: [PATCH 39/47] Changes with a higher instruction count

---
 llvm/test/CodeGen/X86/bitselect.ll | 30 ++++++++++++++++++++----------
 llvm/test/CodeGen/X86/bool-math.ll | 15 ++++++++-------
 llvm/test/CodeGen/X86/subcarry.ll  | 19 +++++++++++--------
 llvm/test/CodeGen/X86/xor-icmp.ll  | 15 ++++++++++-----
 4 files changed, 49 insertions(+), 30 deletions(-)

diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll
index 2922113b14ea90..347cdf39524479 100644
--- a/llvm/test/CodeGen/X86/bitselect.ll
+++ b/llvm/test/CodeGen/X86/bitselect.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64,X64-NOBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64-NOBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64-BMI
 
 ; PR46472
 ; bitselect(a,b,m) == or(and(a,not(m)),and(b,m))
@@ -17,14 +17,24 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind {
 ; X86-NEXT:    xorb %cl, %al
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: bitselect_i8:
-; X64:       # %bb.0:
-; X64-NEXT:    andl %edx, %esi
-; X64-NEXT:    movl %edx, %eax
-; X64-NEXT:    notb %al
-; X64-NEXT:    andb %dil, %al
-; X64-NEXT:    orb %sil, %al
-; X64-NEXT:    retq
+; X64-NOBMI-LABEL: bitselect_i8:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movl %esi, %eax
+; X64-NOBMI-NEXT:    xorb %dil, %al
+; X64-NOBMI-NEXT:    andb %dl, %al
+; X64-NOBMI-NEXT:    xorb %dil, %al
+; X64-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI-LABEL: bitselect_i8:
+; X64-BMI:       # %bb.0:
+; X64-BMI-NEXT:    movl %edx, %eax
+; X64-BMI-NEXT:    andb %al, %sil
+; X64-BMI-NEXT:    notb %al
+; X64-BMI-NEXT:    andb %dil, %al
+; X64-BMI-NEXT:    orb %sil, %al
+; X64-BMI-NEXT:    # kill: def $al killed $al killed $eax
+; X64-BMI-NEXT:    retq
   %not = xor i8 %m, -1
   %ma = and i8 %a, %not
   %mb = and i8 %b, %m
diff --git a/llvm/test/CodeGen/X86/bool-math.ll b/llvm/test/CodeGen/X86/bool-math.ll
index b73af677bc6cdd..dd8511f3a81524 100644
--- a/llvm/test/CodeGen/X86/bool-math.ll
+++ b/llvm/test/CodeGen/X86/bool-math.ll
@@ -262,19 +262,20 @@ define i8 @low_bit_select_constants_bigger_true_narrower_result(i16 %x) {
 define i1 @opaque_constant(i48 %x, i48 %y) {
 ; X64-LABEL: opaque_constant:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    xorq %rsi, %rax
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    shrq $32, %rdi
 ; X64-NEXT:    shrq $32, %rax
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    xorb %dil, %al
+; X64-NEXT:    andb $1, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $rax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: opaque_constant:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    # kill: def $al killed $al killed $eax
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl $1, %ecx
+; X32-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    andb %cl, %al
 ; X32-NEXT:    retl
   %andx = and i48 %x, 4294967296
   %andy = and i48 %y, 4294967296
diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll
index 1e9db9f55a8d5c..1570a3b41b2053 100644
--- a/llvm/test/CodeGen/X86/subcarry.ll
+++ b/llvm/test/CodeGen/X86/subcarry.ll
@@ -46,13 +46,16 @@ define %S @negate(ptr nocapture readonly %this) {
 ; CHECK-NEXT:    subq (%rsi), %rdx
 ; CHECK-NEXT:    movl $0, %edi
 ; CHECK-NEXT:    sbbq 8(%rsi), %rdi
-; CHECK-NEXT:    movl $0, %r8d
-; CHECK-NEXT:    sbbq 16(%rsi), %r8
-; CHECK-NEXT:    sbbq 24(%rsi), %rcx
+; CHECK-NEXT:    sbbq 16(%rsi), %rcx
+; CHECK-NEXT:    setae %r8b
+; CHECK-NEXT:    movzbl %r8b, %r8d
+; CHECK-NEXT:    movq 24(%rsi), %rsi
+; CHECK-NEXT:    notq %rsi
+; CHECK-NEXT:    addq %r8, %rsi
 ; CHECK-NEXT:    movq %rdx, (%rax)
 ; CHECK-NEXT:    movq %rdi, 8(%rax)
-; CHECK-NEXT:    movq %r8, 16(%rax)
-; CHECK-NEXT:    movq %rcx, 24(%rax)
+; CHECK-NEXT:    movq %rcx, 16(%rax)
+; CHECK-NEXT:    movq %rsi, 24(%rax)
 ; CHECK-NEXT:    retq
 entry:
   %0 = load i64, ptr %this, align 8
@@ -108,13 +111,13 @@ define %S @sub(ptr nocapture readonly %this, %S %arg.b) {
 ; CHECK-NEXT:    movzbl %r10b, %r10d
 ; CHECK-NEXT:    notq %r8
 ; CHECK-NEXT:    addq %rdx, %r8
-; CHECK-NEXT:    adcq 24(%rsi), %r10
 ; CHECK-NEXT:    notq %r9
-; CHECK-NEXT:    addq %r10, %r9
+; CHECK-NEXT:    adcq 24(%rsi), %r10
+; CHECK-NEXT:    addq %r9, %r10
 ; CHECK-NEXT:    movq %rdi, (%rax)
 ; CHECK-NEXT:    movq %rcx, 8(%rax)
 ; CHECK-NEXT:    movq %r8, 16(%rax)
-; CHECK-NEXT:    movq %r9, 24(%rax)
+; CHECK-NEXT:    movq %r10, 24(%rax)
 ; CHECK-NEXT:    retq
 entry:
   %0 = extractvalue %S %arg.b, 0
diff --git a/llvm/test/CodeGen/X86/xor-icmp.ll b/llvm/test/CodeGen/X86/xor-icmp.ll
index 16a3b6cb855a7d..59db27ef5e6dd1 100644
--- a/llvm/test/CodeGen/X86/xor-icmp.ll
+++ b/llvm/test/CodeGen/X86/xor-icmp.ll
@@ -15,11 +15,16 @@ define i32 @t(i32 %a, i32 %b) nounwind ssp {
 ;
 ; X64-LABEL: t:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    xorl %esi, %edi
+; X64-NEXT:    shrl $14, %edi
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    btl $14, %esi
+; X64-NEXT:    sbbb $0, %dil
+; X64-NEXT:    je .LBB0_1
+; X64-NEXT:  # %bb.2: # %bb1
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    jmp bar # TAILCALL
+; X64-NEXT:  .LBB0_1: # %bb
 ; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    testl $16384, %edi # imm = 0x4000
-; X64-NEXT:    jne bar # TAILCALL
-; X64-NEXT:  # %bb.1: # %bb
 ; X64-NEXT:    jmp foo # TAILCALL
 entry:
   %0 = and i32 %a, 16384
@@ -96,7 +101,7 @@ define i1 @xor_not_bools(i1 zeroext %x, i1 zeroext %y) nounwind {
 ; X64-LABEL: xor_not_bools:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    xorb %sil, %al
 ; X64-NEXT:    xorb $1, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
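
For reference, the new X64-NOBMI sequence in bitselect.ll is the xor form of
the identity already quoted in that test's header comment:
or(and(a, not(m)), and(b, m)) is equivalent to xor(and(xor(a, b), m), a).
A minimal standalone check of that equivalence (an illustrative sketch, not
part of the patch):

    #include <cassert>
    #include <cstdint>

    // Classic bitselect: bits of b where m is set, bits of a elsewhere.
    static uint8_t bitselect_or(uint8_t a, uint8_t b, uint8_t m) {
      return static_cast<uint8_t>((a & ~m) | (b & m));
    }

    // The form the new X64-NOBMI sequence computes: ((a ^ b) & m) ^ a.
    static uint8_t bitselect_xor(uint8_t a, uint8_t b, uint8_t m) {
      return static_cast<uint8_t>(((a ^ b) & m) ^ a);
    }

    int main() {
      // Exhaustive over all i8 triples (2^24 cases) -- cheap enough.
      for (unsigned a = 0; a < 256; ++a)
        for (unsigned b = 0; b < 256; ++b)
          for (unsigned m = 0; m < 256; ++m)
            assert(bitselect_or(a, b, m) == bitselect_xor(a, b, m));
      return 0;
    }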

>From 93214ce3ea3f53ed12d92ae455bced9cbe6759dc Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 15 Sep 2024 15:10:18 -0700
Subject: [PATCH 40/47] Non-trivial diff

---
 llvm/test/CodeGen/X86/pr32329.ll | 95 ++++++++++++++++++--------------
 1 file changed, 54 insertions(+), 41 deletions(-)

diff --git a/llvm/test/CodeGen/X86/pr32329.ll b/llvm/test/CodeGen/X86/pr32329.ll
index d9671aa04f4603..4a4082a2a59ed4 100644
--- a/llvm/test/CodeGen/X86/pr32329.ll
+++ b/llvm/test/CodeGen/X86/pr32329.ll
@@ -29,33 +29,40 @@ define void @foo() local_unnamed_addr {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movsbl var_27, %eax
-; X86-NEXT:    movzwl var_2, %ebx
-; X86-NEXT:    movl var_310, %ecx
-; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    addl var_24, %ecx
-; X86-NEXT:    movl $4194303, %esi # imm = 0x3FFFFF
-; X86-NEXT:    andl obj, %esi
+; X86-NEXT:    movl obj, %esi
 ; X86-NEXT:    leal (%esi,%esi), %edx
-; X86-NEXT:    subl %eax, %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    movzwl var_2, %eax
+; X86-NEXT:    movb var_310, %ch
+; X86-NEXT:    movsbl var_27, %ebx
+; X86-NEXT:    subb %bl, %dl
+; X86-NEXT:    movb %dl, %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movb %ch, %al
+; X86-NEXT:    mulb %bl
+; X86-NEXT:    addb var_24, %al
+; X86-NEXT:    mulb %cl
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addb $113, %cl
-; X86-NEXT:    movl $9, %ebx
-; X86-NEXT:    xorl %ebp, %ebp
-; X86-NEXT:    shldl %cl, %ebx, %ebp
-; X86-NEXT:    shll %cl, %ebx
+; X86-NEXT:    movl $9, %eax
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    shldl %cl, %eax, %edi
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel %ebx, %ebp
+; X86-NEXT:    cmovnel %eax, %edi
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    cmovnel %ecx, %ebx
-; X86-NEXT:    cmpl %esi, %edi
-; X86-NEXT:    movl %ebp, var_50+4
-; X86-NEXT:    movl %ebx, var_50
+; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    andl $4194303, %esi # imm = 0x3FFFFF
+; X86-NEXT:    leal (%esi,%esi), %ecx
+; X86-NEXT:    subl %ebx, %ecx
+; X86-NEXT:    subl %ebp, %ecx
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    movl %edi, var_50+4
+; X86-NEXT:    movl %eax, var_50
 ; X86-NEXT:    setge var_205
-; X86-NEXT:    imull %eax, %edx
-; X86-NEXT:    movb %dl, var_218
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    mulb %bl
+; X86-NEXT:    movb %al, var_218
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi
@@ -68,27 +75,33 @@ define void @foo() local_unnamed_addr {
 ;
 ; X64-LABEL: foo:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movsbl var_27(%rip), %eax
-; X64-NEXT:    movzwl var_2(%rip), %edx
-; X64-NEXT:    movl var_310(%rip), %ecx
-; X64-NEXT:    imull %eax, %ecx
-; X64-NEXT:    addl var_24(%rip), %ecx
-; X64-NEXT:    movl $4194303, %esi # imm = 0x3FFFFF
-; X64-NEXT:    andl obj(%rip), %esi
-; X64-NEXT:    leal (%rsi,%rsi), %edi
-; X64-NEXT:    subl %eax, %edi
-; X64-NEXT:    movl %edi, %r8d
-; X64-NEXT:    subl %edx, %r8d
-; X64-NEXT:    imull %r8d, %ecx
-; X64-NEXT:    addb $113, %cl
-; X64-NEXT:    movl $9, %edx
+; X64-NEXT:    movl obj(%rip), %esi
+; X64-NEXT:    leal (%rsi,%rsi), %edx
+; X64-NEXT:    movzwl var_2(%rip), %ecx
+; X64-NEXT:    movzwl %cx, %r8d
+; X64-NEXT:    movzbl var_310(%rip), %eax
+; X64-NEXT:    movsbl var_27(%rip), %edi
+; X64-NEXT:    subb %dil, %dl
+; X64-NEXT:    movl %edx, %r9d
+; X64-NEXT:    subb %cl, %r9b
+; X64-NEXT:    mulb %dil
+; X64-NEXT:    addb var_24(%rip), %al
+; X64-NEXT:    mulb %r9b
+; X64-NEXT:    # kill: def $al killed $al def $rax
+; X64-NEXT:    leal 113(%rax), %ecx
+; X64-NEXT:    movl $9, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq %rdx, var_50(%rip)
-; X64-NEXT:    cmpl %esi, %r8d
+; X64-NEXT:    shlq %cl, %rax
+; X64-NEXT:    movq %rax, var_50(%rip)
+; X64-NEXT:    andl $4194303, %esi # imm = 0x3FFFFF
+; X64-NEXT:    leal (%rsi,%rsi), %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    subl %r8d, %eax
+; X64-NEXT:    cmpl %esi, %eax
 ; X64-NEXT:    setge var_205(%rip)
-; X64-NEXT:    imull %eax, %edi
-; X64-NEXT:    movb %dil, var_218(%rip)
+; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    mulb %dil
+; X64-NEXT:    movb %al, var_218(%rip)
 ; X64-NEXT:    retq
   entry:
   %bf.load = load i32, ptr @obj, align 8
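
The 32-bit imull -> 8-bit mulb narrowing above is sound because the narrowed
products feed only byte-width consumers (the movb store into var_218 and a
shift count, which the shifter truncates anyway; the setge compare is still
recomputed at 32 bits), and truncation distributes over multiplication: high
bits of the operands cannot affect low bits of the product. A quick standalone
sanity check of that property (illustrative only, not from the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x = 0; x < 1024; ++x)
        for (uint32_t y = 0; y < 1024; ++y) {
          // trunc(x * y) == trunc(trunc(x) * trunc(y)) at i8.
          uint8_t wide = static_cast<uint8_t>(x * y);
          uint8_t narrow = static_cast<uint8_t>(
              static_cast<uint8_t>(x) * static_cast<uint8_t>(y));
          assert(wide == narrow);
        }
      return 0;
    }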

>From 0aadf98c856ec0f0655eff0556ad3070128719cf Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 15 Sep 2024 15:25:06 -0700
Subject: [PATCH 41/47] Prefer readable condition

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0554d630305e40..839ee9b61c8298 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34447,7 +34447,7 @@ bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
 
 bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
   // i16 instructions are longer (0x66 prefix) and potentially slower.
-  if (SrcVT.isVector() || DestVT.isVector())
+  if (!(SrcVT.isScalarInteger() && DestVT.isScalarInteger()))
     return false;
   return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
 }
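
Note the rewritten guard is slightly stronger than the vector check it
replaces: besides vectors, !(SrcVT.isScalarInteger() && DestVT.isScalarInteger())
also rejects scalar floating-point types, so the two predicates can differ for
an f32 operand even if, presumably, no current caller passes one. A boiled-down
illustration with a hypothetical stand-in type (not LLVM's EVT):

    #include <cassert>

    // Hypothetical stand-in for the two EVT queries used in the patch.
    struct Ty {
      bool Vector;
      bool ScalarInt;
    };

    static bool oldGuardRejects(Ty Src, Ty Dst) {
      return Src.Vector || Dst.Vector;
    }

    static bool newGuardRejects(Ty Src, Ty Dst) {
      return !(Src.ScalarInt && Dst.ScalarInt);
    }

    int main() {
      Ty I32{false, true}, F32{false, false}, V4I32{true, false};
      // The guards agree on vectors and on scalar integers...
      assert(oldGuardRejects(V4I32, I32) && newGuardRejects(V4I32, I32));
      assert(!oldGuardRejects(I32, I32) && !newGuardRejects(I32, I32));
      // ...but only the new guard rejects a scalar floating-point source.
      assert(!oldGuardRejects(F32, I32) && newGuardRejects(F32, I32));
      return 0;
    }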

>From 6f401cbb988bc6706b118171bd25f0416b1f4736 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 15 Sep 2024 15:42:14 -0700
Subject: [PATCH 42/47] Update 2 X86 tests that I missed

---
 llvm/test/CodeGen/X86/vec_saddo.ll | 8 ++------
 llvm/test/CodeGen/X86/vec_uaddo.ll | 8 ++------
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 0aea89e4acf018..460c5fe11f82a5 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -1045,16 +1045,12 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: saddo_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpslld $31, %xmm0, %xmm2
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpslld $31, %xmm2, %xmm2
 ; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k0
-; AVX512-NEXT:    vpslld $31, %xmm1, %xmm2
-; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
 ; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX512-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 34e2811cb9bb5c..bac118095331ca 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1098,16 +1098,12 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: uaddo_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpslld $31, %xmm0, %xmm2
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpslld $31, %xmm2, %xmm2
 ; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k0
-; AVX512-NEXT:    vpslld $31, %xmm1, %xmm2
-; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
 ; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX512-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq

>From e7f2341680b3a5ae9467fbf358edfd8a6eb24139 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 15 Sep 2024 15:43:05 -0700
Subject: [PATCH 43/47] Update register names for NVPTX/boolean-patterns.ll

---
 llvm/test/CodeGen/NVPTX/boolean-patterns.ll | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
index c30ebccdf9199c..fc3a48067a4c8a 100644
--- a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
+++ b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
@@ -43,14 +43,14 @@ define i1 @m2and_ri(i1 %a) {
 define i1 @select2or(i1 %a, i1 %b) {
 ; CHECK-LABEL: select2or(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u8 %rs1, [select2or_param_0];
 ; CHECK-NEXT:    ld.param.u8 %rs2, [select2or_param_1];
-; CHECK-NEXT:    or.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs3;
+; CHECK-NEXT:    or.b16 %rs4, %rs1, %rs2;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
 ; CHECK-NEXT:    and.b32 %r2, %r1, 1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
 ; CHECK-NEXT:    ret;
@@ -61,14 +61,14 @@ define i1 @select2or(i1 %a, i1 %b) {
 define i1 @select2and(i1 %a, i1 %b) {
 ; CHECK-LABEL: select2and(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u8 %rs1, [select2and_param_0];
 ; CHECK-NEXT:    ld.param.u8 %rs2, [select2and_param_1];
-; CHECK-NEXT:    and.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs3;
+; CHECK-NEXT:    and.b16 %rs4, %rs1, %rs2;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
 ; CHECK-NEXT:    and.b32 %r2, %r1, 1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
 ; CHECK-NEXT:    ret;

>From a79e2c231717bc3329f4d4e89cafc76922298418 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 15 Sep 2024 19:02:23 -0700
Subject: [PATCH 44/47] Move isTypeLegal condition up

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 012cb1eac2a8b4..38f7fe6a97d691 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5787,6 +5787,9 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
 
   // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
   if (HandOpcode == ISD::TRUNCATE) {
+    // Don't create a logic op on an illegal type.
+    if (!TLI.isTypeLegal(XVT))
+      return SDValue();
     // If both operands have other uses, this transform would create extra
     // instructions without eliminating anything.
     if (!N0.hasOneUse() && !N1.hasOneUse())
@@ -5805,9 +5808,6 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
     // transformation.
     if (TLI.isNarrowingProfitable(XVT, VT))
       return SDValue();
-    // Don't create a logic op on an illegal type.
-    if (!TLI.isTypeLegal(XVT))
-      return SDValue();
     SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
     return DAG.getNode(HandOpcode, DL, VT, Logic);
   }
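
Hoisting the legality check changes no outcomes -- all three early exits
return SDValue() -- it just lets the cheap type query short-circuit before the
use-count and isNarrowingProfitable checks. The underlying transform,
logic_op (truncate x), (truncate y) --> truncate (logic_op x, y), is the usual
distribution of truncation over bitwise ops; a standalone check at i64 -> i32
(illustrative, not LLVM code):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t x = 0x123456789ABCDEF0ULL, y = 0x0FEDCBA987654321ULL;
      // and (trunc x), (trunc y) == trunc (and x, y); likewise for or/xor.
      uint32_t lhs = static_cast<uint32_t>(x) & static_cast<uint32_t>(y);
      uint32_t rhs = static_cast<uint32_t>(x & y);
      assert(lhs == rhs);
      return 0;
    }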

>From 7632ec0c854e87442274e66336567057aead5ae1 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 18 Sep 2024 19:53:11 -0700
Subject: [PATCH 45/47] Update AMDGPU/computeNumSignBits-mul.ll

---
 llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll b/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
index 11795cca18daa2..07a4478241b186 100644
--- a/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
@@ -5,9 +5,9 @@ define i16 @num_sign_bits_mul_i48_0(i8 %X, i8 %Y, i8 %Z, i8 %W) {
 ; GFX9-LABEL: num_sign_bits_mul_i48_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_i32_i24_sdwa v2, sext(v2), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mul_i32_i24_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v1
+; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %A = sext i8 %X to i48
   %B = sext i8 %Y to i48
@@ -24,8 +24,8 @@ define i16 @num_sign_bits_mul_i48_1(i8 %X, i8 %Y, i8 %Z, i8 %W) {
 ; GFX9-LABEL: num_sign_bits_mul_i48_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mul_i32_i24_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mul_i32_i24_sdwa v2, sext(v2), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NEXT:    v_mul_i32_i24_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v2
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]

>From bd564d026386f3fa8da04e1151e652f3505cfc7c Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 19 Sep 2024 19:19:14 -0700
Subject: [PATCH 46/47] Update X86 bit_ceil.ll & scmp.ll

---
 llvm/test/CodeGen/X86/bit_ceil.ll | 16 ++++++++--------
 llvm/test/CodeGen/X86/scmp.ll     |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/test/CodeGen/X86/bit_ceil.ll b/llvm/test/CodeGen/X86/bit_ceil.ll
index 823453087f6180..13ad9b98ebcc4f 100644
--- a/llvm/test/CodeGen/X86/bit_ceil.ll
+++ b/llvm/test/CodeGen/X86/bit_ceil.ll
@@ -13,8 +13,8 @@ define i32 @bit_ceil_i32(i32 %x) {
 ; NOBMI-NEXT:    bsrl %eax, %eax
 ; NOBMI-NEXT:    movl $63, %ecx
 ; NOBMI-NEXT:    cmovnel %eax, %ecx
-; NOBMI-NEXT:    xorl $31, %ecx
-; NOBMI-NEXT:    negb %cl
+; NOBMI-NEXT:    xorb $-32, %cl
+; NOBMI-NEXT:    addb $33, %cl
 ; NOBMI-NEXT:    movl $1, %edx
 ; NOBMI-NEXT:    movl $1, %eax
 ; NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -50,8 +50,8 @@ define i32 @bit_ceil_i32_plus1(i32 noundef %x) {
 ; NOBMI-NEXT:    bsrl %edi, %eax
 ; NOBMI-NEXT:    movl $63, %ecx
 ; NOBMI-NEXT:    cmovnel %eax, %ecx
-; NOBMI-NEXT:    xorl $31, %ecx
-; NOBMI-NEXT:    negb %cl
+; NOBMI-NEXT:    xorb $-32, %cl
+; NOBMI-NEXT:    addb $33, %cl
 ; NOBMI-NEXT:    movl $1, %edx
 ; NOBMI-NEXT:    movl $1, %eax
 ; NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -89,8 +89,8 @@ define i64 @bit_ceil_i64(i64 %x) {
 ; NOBMI-NEXT:    bsrq %rax, %rax
 ; NOBMI-NEXT:    movl $127, %ecx
 ; NOBMI-NEXT:    cmovneq %rax, %rcx
-; NOBMI-NEXT:    xorl $63, %ecx
-; NOBMI-NEXT:    negb %cl
+; NOBMI-NEXT:    xorb $-64, %cl
+; NOBMI-NEXT:    addb $65, %cl
 ; NOBMI-NEXT:    movl $1, %edx
 ; NOBMI-NEXT:    movl $1, %eax
 ; NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -125,8 +125,8 @@ define i64 @bit_ceil_i64_plus1(i64 noundef %x) {
 ; NOBMI-NEXT:    bsrq %rdi, %rax
 ; NOBMI-NEXT:    movl $127, %ecx
 ; NOBMI-NEXT:    cmovneq %rax, %rcx
-; NOBMI-NEXT:    xorl $63, %ecx
-; NOBMI-NEXT:    negb %cl
+; NOBMI-NEXT:    xorb $-64, %cl
+; NOBMI-NEXT:    addb $65, %cl
 ; NOBMI-NEXT:    movl $1, %edx
 ; NOBMI-NEXT:    movl $1, %eax
 ; NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll
index 7d4bbb06534e6c..40e7b2d9cfdd84 100644
--- a/llvm/test/CodeGen/X86/scmp.ll
+++ b/llvm/test/CodeGen/X86/scmp.ll
@@ -1783,7 +1783,7 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
 ; X86-NEXT:    movl %edi, 28(%ebx)
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    shrl $18, %eax
-; X86-NEXT:    andl $7, %eax
+; X86-NEXT:    andb $7, %al
 ; X86-NEXT:    movb %al, 102(%ebx)
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    addl $52, %esp

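[Editorial note, not part of the patch series] Sanity check for the NOBMI diffs above: both the old pair (xorl $31 / negb, or $63 in the i64 cases) and the new pair (xorb $-32 / addb $33, or $-64 / addb $65) leave a count in %cl that is congruent to bsr+1 modulo the shift width, and x86 variable shifts mask the count to 5 bits (32-bit) or 6 bits (64-bit), so the generated code behaves identically. Likewise, andl $7 vs. andb $7 in scmp.ll differ only above bit 7, and only %al is stored afterwards. A small exhaustive check over every byte value, as a sketch:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (unsigned c = 0; c < 256; ++c) {
      // i32 case: xorl $31 ; negb   vs.   xorb $-32 ; addb $33
      uint8_t oldSeq32 = uint8_t(-int(uint8_t(c ^ 31)));
      uint8_t newSeq32 = uint8_t(uint8_t(c ^ 0xE0) + 33);
      assert((oldSeq32 & 31) == (newSeq32 & 31));
      assert((newSeq32 & 31) == ((c + 1) & 31)); // both are bsr+1 mod 32
      // i64 case: xorl $63 ; negb   vs.   xorb $-64 ; addb $65
      uint8_t oldSeq64 = uint8_t(-int(uint8_t(c ^ 63)));
      uint8_t newSeq64 = uint8_t(uint8_t(c ^ 0xC0) + 65);
      assert((oldSeq64 & 63) == (newSeq64 & 63));
      assert((newSeq64 & 63) == ((c + 1) & 63)); // both are bsr+1 mod 64
    }
    return 0;
  }

The byte-sized sequences are a consequence of the shift-amount arithmetic being narrowed to i8, not a separate optimization.
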
>From b3abc5cb73c0af97dc6b5f60f834769df9370191 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 19 Sep 2024 19:25:14 -0700
Subject: [PATCH 47/47] Address comment

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 38f7fe6a97d691..eb19a4164e39f3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5804,7 +5804,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
     // widening a binop.
     if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
       return SDValue();
-    // Prevent an infinite loop if the target preferts the inverse
+    // Prevent an infinite loop if the target prefers the inverse
     // transformation.
     if (TLI.isNarrowingProfitable(XVT, VT))
       return SDValue();

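[Editorial note, not part of the patch series] For context on the comment being corrected: this hoist widens logic ops over truncates, while a target that "prefers the inverse transformation" narrows them right back, so without the isNarrowingProfitable bail-out the two combines could rewrite each other's output indefinitely. Schematically, using the same arrow notation as the code comments:

  and (trunc x), (trunc y)
    --> trunc (and x, y)           ; this generic hoist
    --> and (trunc x), (trunc y)   ; target combine preferring narrow ops
    --> trunc (and x, y)           ; the hoist fires again, ad infinitum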

