[llvm] r348946 - [x86] allow 8-bit adds to be promoted by convertToThreeAddress() to form LEA

Wed Dec 12 09:58:27 PST 2018

Author: spatel
Date: Wed Dec 12 09:58:27 2018
New Revision: 348946

URL: http://llvm.org/viewvc/llvm-project?rev=348946&view=rev
Log:
[x86] allow 8-bit adds to be promoted by convertToThreeAddress() to form LEA

This extends the code that promotes 16-bit adds to LEA so that it also handles 8-bit adds.
That allows us to combine add ops with register moves and save some instructions. This is
another step towards allowing add truncation in generic DAGCombiner (see D54640).

Differential Revision: https://reviews.llvm.org/D55494

Modified:
    llvm/trunk/lib/Target/X86/X86InstrArithmetic.td
    llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
    llvm/trunk/lib/Target/X86/X86InstrInfo.h
    llvm/trunk/test/CodeGen/X86/GlobalISel/add-scalar.ll
    llvm/trunk/test/CodeGen/X86/GlobalISel/shl-scalar-widening.ll
    llvm/trunk/test/CodeGen/X86/GlobalISel/shl-scalar.ll
    llvm/trunk/test/CodeGen/X86/fixup-bw-copy.ll
    llvm/trunk/test/CodeGen/X86/fshr.ll
    llvm/trunk/test/CodeGen/X86/iabs.ll
    llvm/trunk/test/CodeGen/X86/mul-constant-i8.ll
    llvm/trunk/test/CodeGen/X86/popcnt.ll
    llvm/trunk/test/CodeGen/X86/pr23664.ll
    llvm/trunk/test/CodeGen/X86/rotate4.ll
    llvm/trunk/test/CodeGen/X86/scheduler-backtracking.ll

Modified: llvm/trunk/lib/Target/X86/X86InstrArithmetic.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrArithmetic.td?rev=348946&r1=348945&r2=348946&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86InstrArithmetic.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrArithmetic.td Wed Dec 12 09:58:27 2018
@@ -913,8 +913,8 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc
   let Defs = [EFLAGS] in {
     let Constraints = "$src1 = $dst" in {
       let isCommutable = CommutableRR in {
-        def NAME#8rr  : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
         let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+          def NAME#8rr  : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
           def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
           def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
           def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
@@ -931,9 +931,9 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc
       def NAME#32rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>;
       def NAME#64rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
 
-      def NAME#8ri   : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
-
       let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+        def NAME#8ri   : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
+
         // NOTE: These are order specific, we want the ri8 forms to be listed
         // first so that they are slightly preferred to the ri forms.
         def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>;

Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=348946&r1=348945&r2=348946&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Wed Dec 12 09:58:27 2018
@@ -797,6 +797,13 @@ bool X86InstrInfo::classifyLEAReg(Machin
 MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
     unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
     LiveVariables *LV) const {
+  // We handle 8-bit adds and various 16-bit opcodes in the switch below.
+  bool Is16BitOp = !(MIOpc == X86::ADD8rr || MIOpc == X86::ADD8ri);
+  MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+  assert((!Is16BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
+              *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
+         "Unexpected type for LEA transform");
+
   // TODO: For a 32-bit target, we need to adjust the LEA variables with
   // something like this:
   //   Opcode = X86::LEA32r;
@@ -807,13 +814,12 @@ MachineInstr *X86InstrInfo::convertToThr
   if (!Subtarget.is64Bit())
     return nullptr;
 
-  MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
   unsigned Opcode = X86::LEA64_32r;
   unsigned InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
   unsigned OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
 
   // Build and insert into an implicit UNDEF value. This is OK because
-  // we will be shifting and then extracting the lower 16-bits.
+  // we will be shifting and then extracting the lower 8/16-bits.
   // This has the potential to cause partial register stall. e.g.
   //   movw    (%rbp,%rcx,2), %dx
   //   leal    -65(%rdx), %esi
@@ -824,11 +830,12 @@ MachineInstr *X86InstrInfo::convertToThr
   unsigned Src = MI.getOperand(1).getReg();
   bool IsDead = MI.getOperand(0).isDead();
   bool IsKill = MI.getOperand(1).isKill();
+  unsigned SubReg = Is16BitOp ? X86::sub_16bit : X86::sub_8bit;
   assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
   BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
   MachineInstr *InsMI =
       BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
-          .addReg(InRegLEA, RegState::Define, X86::sub_16bit)
+          .addReg(InRegLEA, RegState::Define, SubReg)
           .addReg(Src, getKillRegState(IsKill));
 
   MachineInstrBuilder MIB =
@@ -847,12 +854,14 @@ MachineInstr *X86InstrInfo::convertToThr
   case X86::DEC16r:
     addRegOffset(MIB, InRegLEA, true, -1);
     break;
+  case X86::ADD8ri:
   case X86::ADD16ri:
   case X86::ADD16ri8:
   case X86::ADD16ri_DB:
   case X86::ADD16ri8_DB:
     addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
     break;
+  case X86::ADD8rr:
   case X86::ADD16rr:
   case X86::ADD16rr_DB: {
     unsigned Src2 = MI.getOperand(2).getReg();
@@ -861,7 +870,7 @@ MachineInstr *X86InstrInfo::convertToThr
     unsigned InRegLEA2 = 0;
     MachineInstr *InsMI2 = nullptr;
     if (Src == Src2) {
-      // ADD16rr killed %reg1028, %reg1028
+      // ADD8rr/ADD16rr killed %reg1028, %reg1028
       // just a single insert_subreg.
       addRegReg(MIB, InRegLEA, true, InRegLEA, false);
     } else {
@@ -870,10 +879,10 @@ MachineInstr *X86InstrInfo::convertToThr
       else
         InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
       // Build and insert into an implicit UNDEF value. This is OK because
-      // we will be shifting and then extracting the lower 16-bits.
+      // we will be shifting and then extracting the lower 8/16-bits.
       BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA2);
       InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
-                   .addReg(InRegLEA2, RegState::Define, X86::sub_16bit)
+                   .addReg(InRegLEA2, RegState::Define, SubReg)
                    .addReg(Src2, getKillRegState(IsKill2));
       addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
     }
@@ -887,7 +896,7 @@ MachineInstr *X86InstrInfo::convertToThr
   MachineInstr *ExtMI =
       BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
           .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
-          .addReg(OutRegLEA, RegState::Kill, X86::sub_16bit);
+          .addReg(OutRegLEA, RegState::Kill, SubReg);
 
   if (LV) {
     // Update live variables.
@@ -1084,6 +1093,7 @@ X86InstrInfo::convertToThreeAddress(Mach
       LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
     break;
   }
+  case X86::ADD8rr:
   case X86::ADD16rr:
   case X86::ADD16rr_DB:
     return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
@@ -1119,6 +1129,7 @@ X86InstrInfo::convertToThreeAddress(Mach
     NewMI = addOffset(MIB, MI.getOperand(2));
     break;
   }
+  case X86::ADD8ri:
   case X86::ADD16ri:
   case X86::ADD16ri8:
   case X86::ADD16ri_DB:

Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.h?rev=348946&r1=348945&r2=348946&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.h (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.h Wed Dec 12 09:58:27 2018
@@ -584,9 +584,9 @@ protected:
                        const MachineOperand *&Destination) const override;
 
 private:
-  /// This is a helper for convertToThreeAddress for 16-bit instructions.
+  /// This is a helper for convertToThreeAddress for 8 and 16-bit instructions.
   /// We use 32-bit LEA to form 3-address code by promoting to a 32-bit
-  /// super-register and then truncating back down to a 16-bit sub-register.
+  /// super-register and then truncating back down to an 8/16-bit sub-register.
   MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc,
                                              MachineFunction::iterator &MFI,
                                              MachineInstr &MI,

Modified: llvm/trunk/test/CodeGen/X86/GlobalISel/add-scalar.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/GlobalISel/add-scalar.ll?rev=348946&r1=348945&r2=348946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/GlobalISel/add-scalar.ll (original)
+++ llvm/trunk/test/CodeGen/X86/GlobalISel/add-scalar.ll Wed Dec 12 09:58:27 2018
@@ -57,8 +57,9 @@ define i16 @test_add_i16(i16 %arg1, i16
 define i8 @test_add_i8(i8 %arg1, i8 %arg2) {
 ; X64-LABEL: test_add_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    addb %dil, %al
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    leal (%rsi,%rdi), %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/GlobalISel/shl-scalar-widening.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/GlobalISel/shl-scalar-widening.ll?rev=348946&r1=348945&r2=348946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/GlobalISel/shl-scalar-widening.ll (original)
+++ llvm/trunk/test/CodeGen/X86/GlobalISel/shl-scalar-widening.ll Wed Dec 12 09:58:27 2018
@@ -11,8 +11,9 @@ define i16 @test_shl_i4(i16 %v, i16 %a,
 ; X64-LABEL: test_shl_i4:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    addb %sil, %cl
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    # kill: def $edx killed $edx def $rdx
+; X64-NEXT:    leal (%rdx,%rsi), %ecx
 ; X64-NEXT:    andb $15, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shlb %cl, %al

Modified: llvm/trunk/test/CodeGen/X86/GlobalISel/shl-scalar.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/GlobalISel/shl-scalar.ll?rev=348946&r1=348945&r2=348946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/GlobalISel/shl-scalar.ll (original)
+++ llvm/trunk/test/CodeGen/X86/GlobalISel/shl-scalar.ll Wed Dec 12 09:58:27 2018
@@ -147,8 +147,8 @@ define i8 @test_shl_i8_imm(i32 %arg1) {
 define i8 @test_shl_i8_imm1(i32 %arg1) {
 ; X64-LABEL: test_shl_i8_imm1:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    addb %al, %al
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi), %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %a = trunc i32 %arg1 to i8

Modified: llvm/trunk/test/CodeGen/X86/fixup-bw-copy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fixup-bw-copy.ll?rev=348946&r1=348945&r2=348946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fixup-bw-copy.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fixup-bw-copy.ll Wed Dec 12 09:58:27 2018
@@ -43,9 +43,10 @@ define i16 @test_movw(i16 %a0) {
 define i8 @test_movb_hreg(i16 %a0) {
 ; X64-LABEL: test_movb_hreg:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    shrl $8, %eax
-; X64-NEXT:    addb %dil, %al
+; X64-NEXT:    leal (%rax,%rdi), %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/fshr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fshr.ll?rev=348946&r1=348945&r2=348946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fshr.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fshr.ll Wed Dec 12 09:58:27 2018
@@ -358,9 +358,9 @@ define i8 @const_shift_i8(i8 %x, i8 %y)
 ;
 ; X64-LABEL: const_shift_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    shrb $7, %sil
-; X64-NEXT:    addb %al, %al
+; X64-NEXT:    leal (%rdi,%rdi), %eax
 ; X64-NEXT:    orb %sil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq

Modified: llvm/trunk/test/CodeGen/X86/iabs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/iabs.ll?rev=348946&r1=348945&r2=348946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/iabs.ll (original)
+++ llvm/trunk/test/CodeGen/X86/iabs.ll Wed Dec 12 09:58:27 2018
@@ -21,10 +21,10 @@ define i8 @test_i8(i8 %a) nounwind {
 ;
 ; X64-LABEL: test_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    movl %edi, %ecx
 ; X64-NEXT:    sarb $7, %cl
-; X64-NEXT:    addb %cl, %al
+; X64-NEXT:    leal (%rdi,%rcx), %eax
 ; X64-NEXT:    xorb %cl, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq

Modified: llvm/trunk/test/CodeGen/X86/mul-constant-i8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mul-constant-i8.ll?rev=348946&r1=348945&r2=348946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/mul-constant-i8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/mul-constant-i8.ll Wed Dec 12 09:58:27 2018
@@ -14,8 +14,8 @@ define i8 @test_mul_by_1(i8 %x) {
 define i8 @test_mul_by_2(i8 %x) {
 ; X64-LABEL: test_mul_by_2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    addb %al, %al
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi), %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 2

Modified: llvm/trunk/test/CodeGen/X86/popcnt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/popcnt.ll?rev=348946&r1=348945&r2=348946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/popcnt.ll (original)
+++ llvm/trunk/test/CodeGen/X86/popcnt.ll Wed Dec 12 09:58:27 2018
@@ -25,6 +25,7 @@ define i8 @cnt8(i8 %x) nounwind readnone
 ;
 ; X64-LABEL: cnt8:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    shrb %al
 ; X64-NEXT:    andb $85, %al
@@ -36,8 +37,9 @@ define i8 @cnt8(i8 %x) nounwind readnone
 ; X64-NEXT:    addb %al, %dil
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    shrb $4, %al
-; X64-NEXT:    addb %dil, %al
+; X64-NEXT:    leal (%rax,%rdi), %eax
 ; X64-NEXT:    andb $15, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X32-POPCNT-LABEL: cnt8:

Modified: llvm/trunk/test/CodeGen/X86/pr23664.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr23664.ll?rev=348946&r1=348945&r2=348946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr23664.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr23664.ll Wed Dec 12 09:58:27 2018
@@ -4,8 +4,8 @@
 define i2 @f(i32 %arg) {
 ; CHECK-LABEL: f:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    addb %al, %al
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal (%rdi,%rdi), %eax
 ; CHECK-NEXT:    orb $1, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq

Modified: llvm/trunk/test/CodeGen/X86/rotate4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/rotate4.ll?rev=348946&r1=348945&r2=348946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/rotate4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/rotate4.ll Wed Dec 12 09:58:27 2018
@@ -642,9 +642,9 @@ define i32 @rotate_demanded_bits_3(i32,
 ;
 ; X64-LABEL: rotate_demanded_bits_3:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    addb %cl, %cl
+; X64-NEXT:    leal (%rsi,%rsi), %ecx
 ; X64-NEXT:    andb $30, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    roll %cl, %eax

Modified: llvm/trunk/test/CodeGen/X86/scheduler-backtracking.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/scheduler-backtracking.ll?rev=348946&r1=348945&r2=348946&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/scheduler-backtracking.ll (original)
+++ llvm/trunk/test/CodeGen/X86/scheduler-backtracking.ll Wed Dec 12 09:58:27 2018
@@ -19,29 +19,28 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-NEXT:    incl %esi
 ; ILP-NEXT:    addb %sil, %sil
 ; ILP-NEXT:    orb $1, %sil
-; ILP-NEXT:    movl $1, %r9d
+; ILP-NEXT:    movl $1, %r10d
 ; ILP-NEXT:    xorl %r14d, %r14d
 ; ILP-NEXT:    movl %esi, %ecx
-; ILP-NEXT:    shldq %cl, %r9, %r14
+; ILP-NEXT:    shldq %cl, %r10, %r14
 ; ILP-NEXT:    movl $1, %edx
 ; ILP-NEXT:    shlq %cl, %rdx
-; ILP-NEXT:    movl %esi, %r11d
-; ILP-NEXT:    addb $-128, %r11b
-; ILP-NEXT:    movb $-128, %r10b
+; ILP-NEXT:    leal -128(%rsi), %r9d
+; ILP-NEXT:    movb $-128, %r11b
 ; ILP-NEXT:    xorl %ebx, %ebx
-; ILP-NEXT:    movl %r11d, %ecx
-; ILP-NEXT:    shldq %cl, %r9, %rbx
+; ILP-NEXT:    movl %r9d, %ecx
+; ILP-NEXT:    shldq %cl, %r10, %rbx
 ; ILP-NEXT:    testb $64, %sil
 ; ILP-NEXT:    cmovneq %rdx, %r14
 ; ILP-NEXT:    cmovneq %r8, %rdx
 ; ILP-NEXT:    movl $1, %edi
 ; ILP-NEXT:    shlq %cl, %rdi
-; ILP-NEXT:    subb %sil, %r10b
-; ILP-NEXT:    movl %r10d, %ecx
-; ILP-NEXT:    shrdq %cl, %r8, %r9
-; ILP-NEXT:    testb $64, %r10b
-; ILP-NEXT:    cmovneq %r8, %r9
+; ILP-NEXT:    subb %sil, %r11b
+; ILP-NEXT:    movl %r11d, %ecx
+; ILP-NEXT:    shrdq %cl, %r8, %r10
 ; ILP-NEXT:    testb $64, %r11b
+; ILP-NEXT:    cmovneq %r8, %r10
+; ILP-NEXT:    testb $64, %r9b
 ; ILP-NEXT:    cmovneq %rdi, %rbx
 ; ILP-NEXT:    cmovneq %r8, %rdi
 ; ILP-NEXT:    testb %sil, %sil
@@ -52,7 +51,7 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-NEXT:    cmovnsq %r8, %rbx
 ; ILP-NEXT:    cmoveq %r8, %rbx
 ; ILP-NEXT:    movq %rbx, 24(%rax)
-; ILP-NEXT:    cmovnsq %r9, %rdi
+; ILP-NEXT:    cmovnsq %r10, %rdi
 ; ILP-NEXT:    cmoveq %r8, %rdi
 ; ILP-NEXT:    movq %rdi, 16(%rax)
 ; ILP-NEXT:    popq %rbx
@@ -76,7 +75,7 @@ define i256 @test1(i256 %a) nounwind {
 ; HYBRID-NEXT:    xorl %r10d, %r10d
 ; HYBRID-NEXT:    movl %esi, %ecx
 ; HYBRID-NEXT:    shldq %cl, %r11, %r10
-; HYBRID-NEXT:    addb $-128, %cl
+; HYBRID-NEXT:    leal -128(%rsi), %ecx
 ; HYBRID-NEXT:    xorl %edi, %edi
 ; HYBRID-NEXT:    shldq %cl, %r11, %rdi
 ; HYBRID-NEXT:    movl $1, %edx
@@ -119,7 +118,7 @@ define i256 @test1(i256 %a) nounwind {
 ; BURR-NEXT:    xorl %r10d, %r10d
 ; BURR-NEXT:    movl %esi, %ecx
 ; BURR-NEXT:    shldq %cl, %r11, %r10
-; BURR-NEXT:    addb $-128, %cl
+; BURR-NEXT:    leal -128(%rsi), %ecx
 ; BURR-NEXT:    xorl %edi, %edi
 ; BURR-NEXT:    shldq %cl, %r11, %rdi
 ; BURR-NEXT:    movl $1, %edx
@@ -160,8 +159,7 @@ define i256 @test1(i256 %a) nounwind {
 ; SRC-NEXT:    shrdq %cl, %r8, %r10
 ; SRC-NEXT:    testb $64, %cl
 ; SRC-NEXT:    cmovneq %r8, %r10
-; SRC-NEXT:    movl %esi, %r9d
-; SRC-NEXT:    addb $-128, %r9b
+; SRC-NEXT:    leal -128(%rsi), %r9d
 ; SRC-NEXT:    xorl %edx, %edx
 ; SRC-NEXT:    movl %r9d, %ecx
 ; SRC-NEXT:    shldq %cl, %rdi, %rdx
@@ -215,13 +213,12 @@ define i256 @test1(i256 %a) nounwind {
 ; LIN-NEXT:    cmovneq %rdx, %rdi
 ; LIN-NEXT:    cmovsq %r9, %rdi
 ; LIN-NEXT:    movq %rdi, 8(%rax)
-; LIN-NEXT:    movl %esi, %edx
-; LIN-NEXT:    addb $-128, %dl
-; LIN-NEXT:    movl $1, %r10d
-; LIN-NEXT:    movl %edx, %ecx
-; LIN-NEXT:    shlq %cl, %r10
-; LIN-NEXT:    testb $64, %dl
-; LIN-NEXT:    movq %r10, %rdi
+; LIN-NEXT:    leal -128(%rsi), %r10d
+; LIN-NEXT:    movl $1, %edx
+; LIN-NEXT:    movl %r10d, %ecx
+; LIN-NEXT:    shlq %cl, %rdx
+; LIN-NEXT:    testb $64, %r10b
+; LIN-NEXT:    movq %rdx, %rdi
 ; LIN-NEXT:    cmovneq %r9, %rdi
 ; LIN-NEXT:    movb $-128, %cl
 ; LIN-NEXT:    subb %sil, %cl
@@ -233,9 +230,9 @@ define i256 @test1(i256 %a) nounwind {
 ; LIN-NEXT:    cmoveq %r9, %rsi
 ; LIN-NEXT:    movq %rsi, 16(%rax)
 ; LIN-NEXT:    xorl %esi, %esi
-; LIN-NEXT:    movl %edx, %ecx
+; LIN-NEXT:    movl %r10d, %ecx
 ; LIN-NEXT:    shldq %cl, %r8, %rsi
-; LIN-NEXT:    cmovneq %r10, %rsi
+; LIN-NEXT:    cmovneq %rdx, %rsi
 ; LIN-NEXT:    cmovnsq %r9, %rsi
 ; LIN-NEXT:    cmoveq %r9, %rsi
 ; LIN-NEXT:    movq %rsi, 24(%rax)