[llvm] r278466 - Recommit 'Remove the restriction that MachineSinking is stopped by
Wei Mi via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 11 20:33:22 PDT 2016
Author: wmi
Date: Thu Aug 11 22:33:22 2016
New Revision: 278466
URL: http://llvm.org/viewvc/llvm-project?rev=278466&view=rev
Log:
Recommit 'Remove the restriction that MachineSinking is stopped by
"insert_subreg, subreg_to_reg, and reg_sequence" instructions' after
adjusting some unittest checks.
This solves PR28852. The restriction was added in 2010 to improve register
coalescing. We believe it is no longer necessary, and testing results on x86
support that assumption.
We will watch closely for any performance impact this brings and are prepared
to help analyze any performance problems found on other architectures.
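For reference, a target that still prefers the old behavior can restore it by
overriding the hook itself. The following is a minimal, hypothetical sketch
(not part of this commit; "MyTargetInstrInfo" is a placeholder for a backend's
TargetInstrInfo subclass) that reinstates the removed default:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Target/TargetInstrInfo.h"

using namespace llvm;

// Illustrative only: refuse to sink subregister-forming instructions, keeping
// them close to their source to make register coalescing easier, exactly as
// the removed default behavior did.
bool MyTargetInstrInfo::shouldSink(const MachineInstr &MI) const {
  return !MI.isInsertSubreg() && !MI.isSubregToReg() && !MI.isRegSequence();
}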
Differential Revision: https://reviews.llvm.org/D23210
Added:
llvm/trunk/test/CodeGen/X86/MachineSink-SubReg.ll
Modified:
llvm/trunk/include/llvm/Target/TargetInstrInfo.h
llvm/trunk/test/CodeGen/AArch64/atomic-ops.ll
llvm/trunk/test/CodeGen/ARM/2012-08-30-select.ll
llvm/trunk/test/CodeGen/X86/2009-04-25-CoalescerBug.ll
llvm/trunk/test/CodeGen/X86/clz.ll
llvm/trunk/test/CodeGen/X86/half.ll
llvm/trunk/test/CodeGen/X86/machine-cse.ll
llvm/trunk/test/CodeGen/X86/uint64-to-float.ll
llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll
Modified: llvm/trunk/include/llvm/Target/TargetInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetInstrInfo.h?rev=278466&r1=278465&r2=278466&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Target/TargetInstrInfo.h (original)
+++ llvm/trunk/include/llvm/Target/TargetInstrInfo.h Thu Aug 11 22:33:22 2016
@@ -270,11 +270,8 @@ public:
/// MachineSink determines on its own whether the instruction is safe to sink;
/// this gives the target a hook to override the default behavior with regards
/// to which instructions should be sunk.
- /// The default behavior is to not sink insert_subreg, subreg_to_reg, and
- /// reg_sequence. These are meant to be close to the source to make it easier
- /// to coalesce.
virtual bool shouldSink(const MachineInstr &MI) const {
- return !MI.isInsertSubreg() && !MI.isSubregToReg() && !MI.isRegSequence();
+ return true;
}
/// Re-issue the specified 'original' instruction at the
Modified: llvm/trunk/test/CodeGen/AArch64/atomic-ops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/atomic-ops.ll?rev=278466&r1=278465&r2=278466&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/atomic-ops.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/atomic-ops.ll Thu Aug 11 22:33:22 2016
@@ -452,20 +452,19 @@ define i16 @test_atomic_load_xchg_i16(i1
define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_xchg_i32:
+; CHECK: mov {{[xw]}}8, w[[OLD:[0-9]+]]
%old = atomicrmw xchg i32* @var32, i32 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; ; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldxr {{[xw]}}[[OLD]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], w0, [x[[ADDR]]]
+; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], w8, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
Modified: llvm/trunk/test/CodeGen/ARM/2012-08-30-select.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/2012-08-30-select.ll?rev=278466&r1=278465&r2=278466&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/2012-08-30-select.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/2012-08-30-select.ll Thu Aug 11 22:33:22 2016
@@ -2,8 +2,10 @@
; rdar://12201387
;CHECK-LABEL: select_s_v_v:
-;CHECK: it ne
+;CHECK: itee ne
;CHECK-NEXT: vmovne.i32
+;CHECK-NEXT: vmoveq
+;CHECK-NEXT: vmoveq
;CHECK: bx
define <16 x i8> @select_s_v_v(<16 x i8> %vec, i32 %avail) {
entry:
Modified: llvm/trunk/test/CodeGen/X86/2009-04-25-CoalescerBug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/2009-04-25-CoalescerBug.ll?rev=278466&r1=278465&r2=278466&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/2009-04-25-CoalescerBug.ll (original)
+++ llvm/trunk/test/CodeGen/X86/2009-04-25-CoalescerBug.ll Thu Aug 11 22:33:22 2016
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep mov | count 2
+; RUN: llc < %s -march=x86-64 | grep mov | count 1
; rdar://6806252
define i64 @test(i32* %tmp13) nounwind {
Added: llvm/trunk/test/CodeGen/X86/MachineSink-SubReg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/MachineSink-SubReg.ll?rev=278466&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/MachineSink-SubReg.ll (added)
+++ llvm/trunk/test/CodeGen/X86/MachineSink-SubReg.ll Thu Aug 11 22:33:22 2016
@@ -0,0 +1,37 @@
+; PR28852: Check machine code sinking is not stopped by SUBREG_TO_REG.
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: foo
+; CHECK-NOT: imull
+; CHECK: retq
+; CHECK: imull
+
+define void @foo(i64 %value, i32 %kLengthBits, i32* nocapture %bits, i64* nocapture %bit_buffer_64, i32 %x) local_unnamed_addr {
+entry:
+ %mul = mul i32 %x, %kLengthBits
+ %add = add i32 %mul, 3
+ %conv = zext i32 %add to i64
+ %mul2 = mul nuw nsw i64 %conv, 5
+ %sub = sub i64 64, %value
+ %conv4 = trunc i64 %sub to i32
+ %tmp0 = load i32, i32* %bits, align 4
+ %cmp = icmp ult i32 %tmp0, %conv4
+ br i1 %cmp, label %if.then, label %if.end, !prof !0
+
+if.then: ; preds = %entry
+ %add7 = add i64 %mul2, %value
+ %tmp1 = load i64, i64* %bit_buffer_64, align 8
+ %add8 = add i64 %add7, %tmp1
+ store i64 %add8, i64* %bit_buffer_64, align 8
+ %conv9 = trunc i64 %mul2 to i32
+ store i32 %conv9, i32* %bits, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 2000}
Modified: llvm/trunk/test/CodeGen/X86/clz.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/clz.ll?rev=278466&r1=278465&r2=278466&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/clz.ll (original)
+++ llvm/trunk/test/CodeGen/X86/clz.ll Thu Aug 11 22:33:22 2016
@@ -427,13 +427,14 @@ define i64 @ctlz_i64_zero_test(i64 %n) {
;
; X64-LABEL: ctlz_i64_zero_test:
; X64: # BB#0:
-; X64-NEXT: movl $64, %eax
; X64-NEXT: testq %rdi, %rdi
-; X64-NEXT: je .LBB11_2
-; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: je .LBB11_1
+; X64-NEXT: # BB#2: # %cond.false
; X64-NEXT: bsrq %rdi, %rax
; X64-NEXT: xorq $63, %rax
-; X64-NEXT: .LBB11_2: # %cond.end
+; X64-NEXT: retq
+; X64-NEXT: .LBB11_1:
+; X64-NEXT: movl $64, %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i64_zero_test:
@@ -601,12 +602,13 @@ define i64 @cttz_i64_zero_test(i64 %n) {
;
; X64-LABEL: cttz_i64_zero_test:
; X64: # BB#0:
-; X64-NEXT: movl $64, %eax
; X64-NEXT: testq %rdi, %rdi
-; X64-NEXT: je .LBB15_2
-; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: je .LBB15_1
+; X64-NEXT: # BB#2: # %cond.false
; X64-NEXT: bsfq %rdi, %rax
-; X64-NEXT: .LBB15_2: # %cond.end
+; X64-NEXT: retq
+; X64-NEXT: .LBB15_1:
+; X64-NEXT: movl $64, %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i64_zero_test:
Modified: llvm/trunk/test/CodeGen/X86/half.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/half.ll?rev=278466&r1=278465&r2=278466&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/half.ll (original)
+++ llvm/trunk/test/CodeGen/X86/half.ll Thu Aug 11 22:33:22 2016
@@ -157,8 +157,6 @@ define void @test_uitofp_i64(i64 %a, hal
; CHECK-LABEL: test_uitofp_i64:
; CHECK-LIBCALL-NEXT: pushq [[ADDR:%[a-z0-9]+]]
; CHECK-LIBCALL-NEXT: movq %rsi, [[ADDR]]
-; CHECK-NEXT: movl %edi, [[REG0:%[a-z0-9]+]]
-; CHECK-NEXT: andl $1, [[REG0]]
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: js [[LABEL1:.LBB[0-9_]+]]
@@ -169,8 +167,10 @@ define void @test_uitofp_i64(i64 %a, hal
; convert using shift+or if negative
; CHECK-NEXT: [[LABEL1]]:
-; CHECK-NEXT: shrq %rdi
-; CHECK-NEXT: orq %rdi, [[REG2:%[a-z0-9]+]]
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: orq %rax, [[REG2:%[a-z0-9]+]]
; CHECK-LIBCALL-NEXT: cvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]]
; CHECK-LIBCALL-NEXT: addss [[REG3]], [[REG1]]
; CHECK-F16C-NEXT: vcvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]], [[REG3]]
Modified: llvm/trunk/test/CodeGen/X86/machine-cse.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/machine-cse.ll?rev=278466&r1=278465&r2=278466&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/machine-cse.ll (original)
+++ llvm/trunk/test/CodeGen/X86/machine-cse.ll Thu Aug 11 22:33:22 2016
@@ -53,15 +53,15 @@ entry:
sw.bb: ; preds = %entry, %entry, %entry
; CHECK: %sw.bb
-; CHECK: imull
+; CHECK-NOT: imull
%mul = mul nsw i32 %test_case, 3
%mul20 = mul nsw i32 %mul, %scale
br i1 undef, label %if.end34, label %sw.bb307
if.end34: ; preds = %sw.bb
; CHECK: %if.end34
+; CHECK: imull
; CHECK: leal
-; CHECK-NOT: imull
tail call void (...) @printf(i32 %test_case, i32 %mul20) nounwind
%tmp = mul i32 %scale, %test_case
%tmp752 = mul i32 %tmp, 3
@@ -104,12 +104,13 @@ return:
; rdar://11393714
define i8* @bsd_memchr(i8* %s, i32 %a, i32 %c, i64 %n) nounwind ssp {
; CHECK: %entry
-; CHECK: xorl
+; CHECK-NOT: xorl
; CHECK: %preheader
+; CHECK-NOT: xorl
; CHECK: %do.body
; CHECK-NOT: xorl
; CHECK: %do.cond
-; CHECK-NOT: xorl
+; CHECK: xorl
; CHECK: %return
entry:
%cmp = icmp eq i64 %n, 0
Modified: llvm/trunk/test/CodeGen/X86/uint64-to-float.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/uint64-to-float.ll?rev=278466&r1=278465&r2=278466&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/uint64-to-float.ll (original)
+++ llvm/trunk/test/CodeGen/X86/uint64-to-float.ll Thu Aug 11 22:33:22 2016
@@ -6,13 +6,15 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
-; CHECK: andl
+; CHECK: %entry
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: js LBB0_1
; CHECK: cvtsi2ss
; CHECK-NEXT: ret
; CHECK: LBB0_1
-; CHECK: shrq
+; CHECK: movq
+; CHECK-NEXT: shrq
+; CHECK-NEXT: andl
; CHECK-NEXT: orq
; CHECK-NEXT: cvtsi2ss
define float @test(i64 %a) {
Modified: llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll?rev=278466&r1=278465&r2=278466&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll Thu Aug 11 22:33:22 2016
@@ -1325,8 +1325,6 @@ define <4 x float> @uitofp_2i64_to_4f32(
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB38_1
; SSE-NEXT: # BB#2:
@@ -1334,16 +1332,16 @@ define <4 x float> @uitofp_2i64_to_4f32(
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB38_3
; SSE-NEXT: .LBB38_1:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB38_3:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm1, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB38_4
; SSE-NEXT: # BB#5:
@@ -1352,10 +1350,12 @@ define <4 x float> @uitofp_2i64_to_4f32(
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
; SSE-NEXT: .LBB38_4:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
@@ -1363,31 +1363,31 @@ define <4 x float> @uitofp_2i64_to_4f32(
; VEX-LABEL: uitofp_2i64_to_4f32:
; VEX: # BB#0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: movl %eax, %ecx
-; VEX-NEXT: andl $1, %ecx
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB38_1
; VEX-NEXT: # BB#2:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: jmp .LBB38_3
; VEX-NEXT: .LBB38_1:
-; VEX-NEXT: shrq %rax
-; VEX-NEXT: orq %rax, %rcx
-; VEX-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
+; VEX-NEXT: movq %rax, %rcx
+; VEX-NEXT: shrq %rcx
+; VEX-NEXT: andl $1, %eax
+; VEX-NEXT: orq %rcx, %rax
+; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
; VEX-NEXT: .LBB38_3:
; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: movl %eax, %ecx
-; VEX-NEXT: andl $1, %ecx
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB38_4
; VEX-NEXT: # BB#5:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: jmp .LBB38_6
; VEX-NEXT: .LBB38_4:
-; VEX-NEXT: shrq %rax
-; VEX-NEXT: orq %rax, %rcx
-; VEX-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm0
+; VEX-NEXT: movq %rax, %rcx
+; VEX-NEXT: shrq %rcx
+; VEX-NEXT: andl $1, %eax
+; VEX-NEXT: orq %rcx, %rax
+; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
; VEX-NEXT: .LBB38_6:
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
@@ -1429,8 +1429,6 @@ define <4 x float> @uitofp_4i64_to_4f32_
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: .LBB39_2:
; SSE-NEXT: movd %xmm1, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB39_3
; SSE-NEXT: # BB#4:
@@ -1438,17 +1436,17 @@ define <4 x float> @uitofp_4i64_to_4f32_
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB39_5
; SSE-NEXT: .LBB39_3:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB39_5:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm1, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB39_6
; SSE-NEXT: # BB#7:
@@ -1456,10 +1454,12 @@ define <4 x float> @uitofp_4i64_to_4f32_
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB39_8
; SSE-NEXT: .LBB39_6:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB39_8:
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -1469,31 +1469,31 @@ define <4 x float> @uitofp_4i64_to_4f32_
; VEX-LABEL: uitofp_4i64_to_4f32_undef:
; VEX: # BB#0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: movl %eax, %ecx
-; VEX-NEXT: andl $1, %ecx
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB39_1
; VEX-NEXT: # BB#2:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: jmp .LBB39_3
; VEX-NEXT: .LBB39_1:
-; VEX-NEXT: shrq %rax
-; VEX-NEXT: orq %rax, %rcx
-; VEX-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
+; VEX-NEXT: movq %rax, %rcx
+; VEX-NEXT: shrq %rcx
+; VEX-NEXT: andl $1, %eax
+; VEX-NEXT: orq %rcx, %rax
+; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
; VEX-NEXT: .LBB39_3:
; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: movl %eax, %ecx
-; VEX-NEXT: andl $1, %ecx
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB39_4
; VEX-NEXT: # BB#5:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: jmp .LBB39_6
; VEX-NEXT: .LBB39_4:
-; VEX-NEXT: shrq %rax
-; VEX-NEXT: orq %rax, %rcx
-; VEX-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm0
+; VEX-NEXT: movq %rax, %rcx
+; VEX-NEXT: shrq %rcx
+; VEX-NEXT: andl $1, %eax
+; VEX-NEXT: orq %rcx, %rax
+; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
; VEX-NEXT: .LBB39_6:
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
@@ -1694,37 +1694,35 @@ define <4 x float> @uitofp_4i64_to_4f32(
; SSE-LABEL: uitofp_4i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movd %xmm1, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB45_1
; SSE-NEXT: # BB#2:
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB45_3
; SSE-NEXT: .LBB45_1:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm3
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: addss %xmm3, %xmm3
; SSE-NEXT: .LBB45_3:
; SSE-NEXT: movd %xmm0, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB45_4
; SSE-NEXT: # BB#5:
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB45_6
; SSE-NEXT: .LBB45_4:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm2
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB45_6:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm1, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB45_7
; SSE-NEXT: # BB#8:
@@ -1732,17 +1730,17 @@ define <4 x float> @uitofp_4i64_to_4f32(
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB45_9
; SSE-NEXT: .LBB45_7:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB45_9:
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movd %xmm0, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB45_10
; SSE-NEXT: # BB#11:
@@ -1750,10 +1748,12 @@ define <4 x float> @uitofp_4i64_to_4f32(
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB45_12
; SSE-NEXT: .LBB45_10:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB45_12:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1764,53 +1764,51 @@ define <4 x float> @uitofp_4i64_to_4f32(
; AVX1-LABEL: uitofp_4i64_to_4f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB45_1
; AVX1-NEXT: # BB#2:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: jmp .LBB45_3
; AVX1-NEXT: .LBB45_1:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB45_3:
; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB45_4
; AVX1-NEXT: # BB#5:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: jmp .LBB45_6
; AVX1-NEXT: .LBB45_4:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB45_6:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB45_7
; AVX1-NEXT: # BB#8:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: jmp .LBB45_9
; AVX1-NEXT: .LBB45_7:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB45_9:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB45_10
; AVX1-NEXT: # BB#11:
@@ -1819,9 +1817,11 @@ define <4 x float> @uitofp_4i64_to_4f32(
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
; AVX1-NEXT: .LBB45_10:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
@@ -1830,53 +1830,51 @@ define <4 x float> @uitofp_4i64_to_4f32(
; AVX2-LABEL: uitofp_4i64_to_4f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB45_1
; AVX2-NEXT: # BB#2:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: jmp .LBB45_3
; AVX2-NEXT: .LBB45_1:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB45_3:
; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB45_4
; AVX2-NEXT: # BB#5:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: jmp .LBB45_6
; AVX2-NEXT: .LBB45_4:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB45_6:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB45_7
; AVX2-NEXT: # BB#8:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: jmp .LBB45_9
; AVX2-NEXT: .LBB45_7:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB45_9:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB45_10
; AVX2-NEXT: # BB#11:
@@ -1885,9 +1883,11 @@ define <4 x float> @uitofp_4i64_to_4f32(
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-NEXT: .LBB45_10:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
@@ -3083,37 +3083,35 @@ define <4 x float> @uitofp_load_4i64_to_
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm3
; SSE-NEXT: movd %xmm3, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB74_1
; SSE-NEXT: # BB#2:
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB74_3
; SSE-NEXT: .LBB74_1:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm2
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB74_3:
; SSE-NEXT: movd %xmm1, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB74_4
; SSE-NEXT: # BB#5:
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB74_6
; SSE-NEXT: .LBB74_4:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB74_6:
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE-NEXT: movd %xmm3, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB74_7
; SSE-NEXT: # BB#8:
@@ -3121,17 +3119,17 @@ define <4 x float> @uitofp_load_4i64_to_
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB74_9
; SSE-NEXT: .LBB74_7:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm3, %xmm3
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm3
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: addss %xmm3, %xmm3
; SSE-NEXT: .LBB74_9:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm1, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB74_10
; SSE-NEXT: # BB#11:
@@ -3139,10 +3137,12 @@ define <4 x float> @uitofp_load_4i64_to_
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB74_12
; SSE-NEXT: .LBB74_10:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB74_12:
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
@@ -3153,53 +3153,51 @@ define <4 x float> @uitofp_load_4i64_to_
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB74_1
; AVX1-NEXT: # BB#2:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: jmp .LBB74_3
; AVX1-NEXT: .LBB74_1:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB74_3:
; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB74_4
; AVX1-NEXT: # BB#5:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: jmp .LBB74_6
; AVX1-NEXT: .LBB74_4:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB74_6:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB74_7
; AVX1-NEXT: # BB#8:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: jmp .LBB74_9
; AVX1-NEXT: .LBB74_7:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB74_9:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB74_10
; AVX1-NEXT: # BB#11:
@@ -3208,9 +3206,11 @@ define <4 x float> @uitofp_load_4i64_to_
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
; AVX1-NEXT: .LBB74_10:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
@@ -3220,53 +3220,51 @@ define <4 x float> @uitofp_load_4i64_to_
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB74_1
; AVX2-NEXT: # BB#2:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: jmp .LBB74_3
; AVX2-NEXT: .LBB74_1:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB74_3:
; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB74_4
; AVX2-NEXT: # BB#5:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: jmp .LBB74_6
; AVX2-NEXT: .LBB74_4:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB74_6:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB74_7
; AVX2-NEXT: # BB#8:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: jmp .LBB74_9
; AVX2-NEXT: .LBB74_7:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB74_9:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB74_10
; AVX2-NEXT: # BB#11:
@@ -3275,9 +3273,11 @@ define <4 x float> @uitofp_load_4i64_to_
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-NEXT: .LBB74_10:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
@@ -3408,52 +3408,50 @@ define <8 x float> @uitofp_load_8i64_to_
; SSE-NEXT: movdqa 32(%rdi), %xmm2
; SSE-NEXT: movdqa 48(%rdi), %xmm3
; SSE-NEXT: movd %xmm5, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB78_1
; SSE-NEXT: # BB#2:
; SSE-NEXT: cvtsi2ssq %rax, %xmm4
; SSE-NEXT: jmp .LBB78_3
; SSE-NEXT: .LBB78_1:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm4
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm4
; SSE-NEXT: addss %xmm4, %xmm4
; SSE-NEXT: .LBB78_3:
; SSE-NEXT: movd %xmm1, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB78_4
; SSE-NEXT: # BB#5:
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB78_6
; SSE-NEXT: .LBB78_4:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB78_6:
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
; SSE-NEXT: movd %xmm5, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB78_7
; SSE-NEXT: # BB#8:
; SSE-NEXT: cvtsi2ssq %rax, %xmm6
; SSE-NEXT: jmp .LBB78_9
; SSE-NEXT: .LBB78_7:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm6
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm6
; SSE-NEXT: addss %xmm6, %xmm6
; SSE-NEXT: .LBB78_9:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm1, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB78_10
; SSE-NEXT: # BB#11:
@@ -3461,29 +3459,29 @@ define <8 x float> @uitofp_load_8i64_to_
; SSE-NEXT: cvtsi2ssq %rax, %xmm5
; SSE-NEXT: jmp .LBB78_12
; SSE-NEXT: .LBB78_10:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm5, %xmm5
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm5
+; SSE-NEXT: cvtsi2ssq %rax, %xmm5
; SSE-NEXT: addss %xmm5, %xmm5
; SSE-NEXT: .LBB78_12:
; SSE-NEXT: movd %xmm3, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB78_13
; SSE-NEXT: # BB#14:
; SSE-NEXT: cvtsi2ssq %rax, %xmm7
; SSE-NEXT: jmp .LBB78_15
; SSE-NEXT: .LBB78_13:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm7
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm7
; SSE-NEXT: addss %xmm7, %xmm7
; SSE-NEXT: .LBB78_15:
; SSE-NEXT: movd %xmm2, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB78_16
; SSE-NEXT: # BB#17:
@@ -3491,18 +3489,18 @@ define <8 x float> @uitofp_load_8i64_to_
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB78_18
; SSE-NEXT: .LBB78_16:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB78_18:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE-NEXT: movd %xmm3, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB78_19
; SSE-NEXT: # BB#20:
@@ -3510,18 +3508,18 @@ define <8 x float> @uitofp_load_8i64_to_
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB78_21
; SSE-NEXT: .LBB78_19:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm3, %xmm3
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm3
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: addss %xmm3, %xmm3
; SSE-NEXT: .LBB78_21:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE-NEXT: movd %xmm2, %rax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB78_22
; SSE-NEXT: # BB#23:
@@ -3529,10 +3527,12 @@ define <8 x float> @uitofp_load_8i64_to_
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB78_24
; SSE-NEXT: .LBB78_22:
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: cvtsi2ssq %rcx, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB78_24:
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
@@ -3544,122 +3544,122 @@ define <8 x float> @uitofp_load_8i64_to_
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_1
; AVX1-NEXT: # BB#2:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: jmp .LBB78_3
; AVX1-NEXT: .LBB78_1:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB78_3:
; AVX1-NEXT: vmovq %xmm2, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_4
; AVX1-NEXT: # BB#5:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX1-NEXT: jmp .LBB78_6
; AVX1-NEXT: .LBB78_4:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm3
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .LBB78_6:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vmovq %xmm2, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_7
; AVX1-NEXT: # BB#8:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX1-NEXT: jmp .LBB78_9
; AVX1-NEXT: .LBB78_7:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm4, %xmm4
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX1-NEXT: .LBB78_9:
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_10
; AVX1-NEXT: # BB#11:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
; AVX1-NEXT: jmp .LBB78_12
; AVX1-NEXT: .LBB78_10:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm2
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB78_12:
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_13
; AVX1-NEXT: # BB#14:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX1-NEXT: jmp .LBB78_15
; AVX1-NEXT: .LBB78_13:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm5
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX1-NEXT: .LBB78_15:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_16
; AVX1-NEXT: # BB#17:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
; AVX1-NEXT: jmp .LBB78_18
; AVX1-NEXT: .LBB78_16:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm3
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .LBB78_18:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vmovq %xmm4, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_19
; AVX1-NEXT: # BB#20:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
; AVX1-NEXT: jmp .LBB78_21
; AVX1-NEXT: .LBB78_19:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5
; AVX1-NEXT: .LBB78_21:
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
; AVX1-NEXT: vpextrq $1, %xmm4, %rax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB78_22
; AVX1-NEXT: # BB#23:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
; AVX1-NEXT: jmp .LBB78_24
; AVX1-NEXT: .LBB78_22:
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: orq %rax, %rcx
-; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm2
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB78_24:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
@@ -3671,122 +3671,122 @@ define <8 x float> @uitofp_load_8i64_to_
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_1
; AVX2-NEXT: # BB#2:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: jmp .LBB78_3
; AVX2-NEXT: .LBB78_1:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB78_3:
; AVX2-NEXT: vmovq %xmm2, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_4
; AVX2-NEXT: # BB#5:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX2-NEXT: jmp .LBB78_6
; AVX2-NEXT: .LBB78_4:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm3
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT: .LBB78_6:
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vmovq %xmm2, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_7
; AVX2-NEXT: # BB#8:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX2-NEXT: jmp .LBB78_9
; AVX2-NEXT: .LBB78_7:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm4, %xmm4
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX2-NEXT: .LBB78_9:
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_10
; AVX2-NEXT: # BB#11:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
; AVX2-NEXT: jmp .LBB78_12
; AVX2-NEXT: .LBB78_10:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm2
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB78_12:
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_13
; AVX2-NEXT: # BB#14:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX2-NEXT: jmp .LBB78_15
; AVX2-NEXT: .LBB78_13:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm5
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX2-NEXT: .LBB78_15:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_16
; AVX2-NEXT: # BB#17:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
; AVX2-NEXT: jmp .LBB78_18
; AVX2-NEXT: .LBB78_16:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm3
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT: .LBB78_18:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vmovq %xmm4, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_19
; AVX2-NEXT: # BB#20:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
; AVX2-NEXT: jmp .LBB78_21
; AVX2-NEXT: .LBB78_19:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5
; AVX2-NEXT: .LBB78_21:
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
; AVX2-NEXT: vpextrq $1, %xmm4, %rax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB78_22
; AVX2-NEXT: # BB#23:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
; AVX2-NEXT: jmp .LBB78_24
; AVX2-NEXT: .LBB78_22:
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm2
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB78_24:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]