[llvm] [DAG] Don't split f64 constant stores if the fp imm is legal (PR #74622)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 6 08:30:16 PST 2023


https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/74622

If the target can materialize the specific fp immediate constant directly (i.e. TLI.isFPImmLegal returns true), then don't split the f64 constant store into 2 x i32 stores.

Another cleanup step for #74304

From e6adfbc77bcf4c20703e714f5ecf5901f5ddea64 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 6 Dec 2023 16:28:26 +0000
Subject: [PATCH] [DAG] Don't split f64 constant stores if the fp imm is legal

If the target can generate a specific fp immediate constant, then don't split the store into 2 x i32 stores

Another cleanup step for #74304
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  5 +-
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  3 +-
 llvm/test/CodeGen/ARM/aapcs-hfa-code.ll       | 12 +--
 llvm/test/CodeGen/ARM/ha-alignstack-call.ll   | 20 ++--
 llvm/test/CodeGen/Mips/pr49200.ll             |  9 +-
 llvm/test/CodeGen/X86/fp-intrinsics.ll        |  4 +-
 llvm/test/CodeGen/X86/ldexp.ll                |  7 +-
 llvm/test/CodeGen/X86/memset64-on-x86-32.ll   | 31 +++---
 llvm/test/CodeGen/X86/pr38738.ll              | 25 ++---
 llvm/test/CodeGen/X86/slow-unaligned-mem.ll   | 94 +++++++++++--------
 llvm/test/CodeGen/X86/zero-remat.ll           |  7 +-
 11 files changed, 107 insertions(+), 110 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2a3425a42607e..0c5b2894a2e76 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20911,7 +20911,8 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
   // transform should not be done in this case.
 
   SDValue Tmp;
-  switch (CFP->getSimpleValueType(0).SimpleTy) {
+  MVT SimpleVT = CFP->getSimpleValueType(0);
+  switch (SimpleVT.SimpleTy) {
   default:
     llvm_unreachable("Unknown FP type");
   case MVT::f16:    // We don't do this for these yet.
@@ -20940,7 +20941,7 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
                           Ptr, ST->getMemOperand());
     }
 
-    if (ST->isSimple() &&
+    if (ST->isSimple() && !TLI.isFPImmLegal(CFP->getValueAPF(), SimpleVT) &&
         TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
       // Many FP stores are not made apparent until after legalize, e.g. for
       // argument passing.  Since this is so common, custom legalize the
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 7a54141fa711a..5e1f9fbcdde0a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -461,7 +461,8 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
                           ST->getOriginalAlign(), MMOFlags, AAInfo);
     }
 
-    if (CFP->getValueType(0) == MVT::f64) {
+    if (CFP->getValueType(0) == MVT::f64 &&
+        !TLI.isFPImmLegal(CFP->getValueAPF(), MVT::f64)) {
       // If this target supports 64-bit registers, do a single 64-bit store.
       if (TLI.isTypeLegal(MVT::i64)) {
         SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
diff --git a/llvm/test/CodeGen/ARM/aapcs-hfa-code.ll b/llvm/test/CodeGen/ARM/aapcs-hfa-code.ll
index e32f19ef67452..dabbb1e38a86b 100644
--- a/llvm/test/CodeGen/ARM/aapcs-hfa-code.ll
+++ b/llvm/test/CodeGen/ARM/aapcs-hfa-code.ll
@@ -104,10 +104,8 @@ define arm_aapcs_vfpcc void @test_1double_nosplit([4 x float], [4 x double], [3
 ; CHECK-NEXT:    push {r11, lr}
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, sp, #8
-; CHECK-NEXT:    movw r1, #0
-; CHECK-NEXT:    mov r0, #0
-; CHECK-NEXT:    movt r1, #16368
-; CHECK-NEXT:    strd r0, r1, [sp]
+; CHECK-NEXT:    vmov.f64 d16, #1.000000e+00
+; CHECK-NEXT:    vstr d16, [sp]
 ; CHECK-NEXT:    bl test_1double_nosplit
 ; CHECK-NEXT:    add sp, sp, #8
 ; CHECK-NEXT:    pop {r11, pc}
@@ -138,10 +136,8 @@ define arm_aapcs_vfpcc void @test_1double_misaligned([4 x double], [4 x double],
 ; CHECK-NEXT:    push {r11, lr}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    movw r1, #0
-; CHECK-NEXT:    mov r0, #0
-; CHECK-NEXT:    movt r1, #16368
-; CHECK-NEXT:    strd r0, r1, [sp, #8]
+; CHECK-NEXT:    vmov.f64 d16, #1.000000e+00
+; CHECK-NEXT:    vstr d16, [sp, #8]
 ; CHECK-NEXT:    bl test_1double_misaligned
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    pop {r11, pc}
diff --git a/llvm/test/CodeGen/ARM/ha-alignstack-call.ll b/llvm/test/CodeGen/ARM/ha-alignstack-call.ll
index e861fe397f849..7e2a911c89281 100644
--- a/llvm/test/CodeGen/ARM/ha-alignstack-call.ll
+++ b/llvm/test/CodeGen/ARM/ha-alignstack-call.ll
@@ -300,16 +300,16 @@ entry:
   ret double %call
 }
 ; CHECK-LABEL: g2_1_call:
-; CHECK:       movw   r0, #0
-; CHECK:       mov    r1, #0
-; CHECK:       movt   r0, #16352
-; CHECK:       str    r1, [sp]
-; CHECK:       stmib  sp, {r0, r1}
-; CHECK:       str    r1, [sp, #12]
-; CHECK:       str    r1, [sp, #16]
-; CHECK:       str    r1, [sp, #20]
-; CHECK:       str    r1, [sp, #24]
-; CHECK:       str    r1, [sp, #28]
+; CHECK:       vmov.f64 d16, #5.000000e-01
+; CHECK:       mov    r0, #0
+; CHECK:       str    r0, [sp, #8]
+; CHECK:       str    r0, [sp, #12]
+; CHECK:       str    r0, [sp, #16]
+; CHECK:       vmov.i32 d0, #0x0
+; CHECK:       str    r0, [sp, #20]
+; CHECK:       str    r0, [sp, #24]
+; CHECK:       str    r0, [sp, #28]
+; CHECK:       vstr   d16, [sp]
 ; CHECK:       bl    g2_1
 
 ; pass in memory, alignment 8
diff --git a/llvm/test/CodeGen/Mips/pr49200.ll b/llvm/test/CodeGen/Mips/pr49200.ll
index 80a2bdd4e95ee..2a9f207b29e58 100644
--- a/llvm/test/CodeGen/Mips/pr49200.ll
+++ b/llvm/test/CodeGen/Mips/pr49200.ll
@@ -11,11 +11,10 @@ define dso_local void @foo() #0 {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addiusp -24
-; CHECK-NEXT:    li16 $2, 0
-; CHECK-NEXT:    sw $2, 4($sp)
-; CHECK-NEXT:    sw $2, 0($sp)
-; CHECK-NEXT:    sw $2, 12($sp)
-; CHECK-NEXT:    sw $2, 8($sp)
+; CHECK-NEXT:    mtc1 $zero, $f0
+; CHECK-NEXT:    mthc1 $zero, $f0
+; CHECK-NEXT:    sdc1 $f0, 0($sp)
+; CHECK-NEXT:    sdc1 $f0, 8($sp)
 ; CHECK-NEXT:    ldc1 $f0, 0($sp)
 ; CHECK-NEXT:    sdc1 $f0, 16($sp)
 ; CHECK-NEXT:    addiusp 24
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll
index 32e45adcb94d7..5f77e2cb46cbf 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll
@@ -865,9 +865,9 @@ define double @f19() #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 32
 ; X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X87-NEXT:    fstpl {{[0-9]+}}(%esp)
+; X87-NEXT:    fld1
+; X87-NEXT:    fstpl (%esp)
 ; X87-NEXT:    wait
-; X87-NEXT:    movl $1072693248, {{[0-9]+}}(%esp) # imm = 0x3FF00000
-; X87-NEXT:    movl $0, (%esp)
 ; X87-NEXT:    calll fmod
 ; X87-NEXT:    addl $28, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
diff --git a/llvm/test/CodeGen/X86/ldexp.ll b/llvm/test/CodeGen/X86/ldexp.ll
index 44c57c54ba023..ec128fc6686c8 100644
--- a/llvm/test/CodeGen/X86/ldexp.ll
+++ b/llvm/test/CodeGen/X86/ldexp.ll
@@ -91,10 +91,11 @@ define double @ldexp_f64(i8 zeroext %x) {
 ;
 ; WIN32-LABEL: ldexp_f64:
 ; WIN32:       # %bb.0:
+; WIN32-NEXT:    subl $12, %esp
 ; WIN32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    pushl %eax
-; WIN32-NEXT:    pushl $1072693248 # imm = 0x3FF00000
-; WIN32-NEXT:    pushl $0
+; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fld1
+; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _ldexp
 ; WIN32-NEXT:    addl $12, %esp
 ; WIN32-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
index c6eecdcdf99cc..480a0970bd39d 100644
--- a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -18,26 +18,17 @@ define void @bork(ptr nocapture align 4 %dst) nounwind {
 ; SLOW_32-LABEL: bork:
 ; SLOW_32:       # %bb.0:
 ; SLOW_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SLOW_32-NEXT:    movl $0, 4(%eax)
-; SLOW_32-NEXT:    movl $0, (%eax)
-; SLOW_32-NEXT:    movl $0, 12(%eax)
-; SLOW_32-NEXT:    movl $0, 8(%eax)
-; SLOW_32-NEXT:    movl $0, 20(%eax)
-; SLOW_32-NEXT:    movl $0, 16(%eax)
-; SLOW_32-NEXT:    movl $0, 28(%eax)
-; SLOW_32-NEXT:    movl $0, 24(%eax)
-; SLOW_32-NEXT:    movl $0, 36(%eax)
-; SLOW_32-NEXT:    movl $0, 32(%eax)
-; SLOW_32-NEXT:    movl $0, 44(%eax)
-; SLOW_32-NEXT:    movl $0, 40(%eax)
-; SLOW_32-NEXT:    movl $0, 52(%eax)
-; SLOW_32-NEXT:    movl $0, 48(%eax)
-; SLOW_32-NEXT:    movl $0, 60(%eax)
-; SLOW_32-NEXT:    movl $0, 56(%eax)
-; SLOW_32-NEXT:    movl $0, 68(%eax)
-; SLOW_32-NEXT:    movl $0, 64(%eax)
-; SLOW_32-NEXT:    movl $0, 76(%eax)
-; SLOW_32-NEXT:    movl $0, 72(%eax)
+; SLOW_32-NEXT:    xorps %xmm0, %xmm0
+; SLOW_32-NEXT:    movsd %xmm0, 72(%eax)
+; SLOW_32-NEXT:    movsd %xmm0, 64(%eax)
+; SLOW_32-NEXT:    movsd %xmm0, 56(%eax)
+; SLOW_32-NEXT:    movsd %xmm0, 48(%eax)
+; SLOW_32-NEXT:    movsd %xmm0, 40(%eax)
+; SLOW_32-NEXT:    movsd %xmm0, 32(%eax)
+; SLOW_32-NEXT:    movsd %xmm0, 24(%eax)
+; SLOW_32-NEXT:    movsd %xmm0, 16(%eax)
+; SLOW_32-NEXT:    movsd %xmm0, 8(%eax)
+; SLOW_32-NEXT:    movsd %xmm0, (%eax)
 ; SLOW_32-NEXT:    retl
 ;
 ; SLOW_64-LABEL: bork:
diff --git a/llvm/test/CodeGen/X86/pr38738.ll b/llvm/test/CodeGen/X86/pr38738.ll
index 753b7ce33d2be..205849e7d05db 100644
--- a/llvm/test/CodeGen/X86/pr38738.ll
+++ b/llvm/test/CodeGen/X86/pr38738.ll
@@ -130,22 +130,15 @@ define void @tryset(ptr nocapture %x) {
 ; X86SSE2-LABEL: tryset:
 ; X86SSE2:       # %bb.0:
 ; X86SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86SSE2-NEXT:    movl $0, 4(%eax)
-; X86SSE2-NEXT:    movl $0, (%eax)
-; X86SSE2-NEXT:    movl $0, 12(%eax)
-; X86SSE2-NEXT:    movl $0, 8(%eax)
-; X86SSE2-NEXT:    movl $0, 20(%eax)
-; X86SSE2-NEXT:    movl $0, 16(%eax)
-; X86SSE2-NEXT:    movl $0, 28(%eax)
-; X86SSE2-NEXT:    movl $0, 24(%eax)
-; X86SSE2-NEXT:    movl $0, 36(%eax)
-; X86SSE2-NEXT:    movl $0, 32(%eax)
-; X86SSE2-NEXT:    movl $0, 44(%eax)
-; X86SSE2-NEXT:    movl $0, 40(%eax)
-; X86SSE2-NEXT:    movl $0, 52(%eax)
-; X86SSE2-NEXT:    movl $0, 48(%eax)
-; X86SSE2-NEXT:    movl $0, 60(%eax)
-; X86SSE2-NEXT:    movl $0, 56(%eax)
+; X86SSE2-NEXT:    xorps %xmm0, %xmm0
+; X86SSE2-NEXT:    movsd %xmm0, 56(%eax)
+; X86SSE2-NEXT:    movsd %xmm0, 48(%eax)
+; X86SSE2-NEXT:    movsd %xmm0, 40(%eax)
+; X86SSE2-NEXT:    movsd %xmm0, 32(%eax)
+; X86SSE2-NEXT:    movsd %xmm0, 24(%eax)
+; X86SSE2-NEXT:    movsd %xmm0, 16(%eax)
+; X86SSE2-NEXT:    movsd %xmm0, 8(%eax)
+; X86SSE2-NEXT:    movsd %xmm0, (%eax)
 ; X86SSE2-NEXT:    retl
 ;
 ; X64AVX-LABEL: tryset:
diff --git a/llvm/test/CodeGen/X86/slow-unaligned-mem.ll b/llvm/test/CodeGen/X86/slow-unaligned-mem.ll
index 85afa83e3f08e..d74d195439bda 100644
--- a/llvm/test/CodeGen/X86/slow-unaligned-mem.ll
+++ b/llvm/test/CodeGen/X86/slow-unaligned-mem.ll
@@ -1,16 +1,16 @@
 ; Intel chips with slow unaligned memory accesses
 
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3      2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3m     2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium-m     2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4      2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4m     2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=yonah         2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=prescott      2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nocona        2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=core2         2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=penryn        2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bonnell       2>&1 | FileCheck %s --check-prefixes=SLOW
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3      2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SCALAR
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3m     2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SCALAR
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium-m     2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4      2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4m     2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=yonah         2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=prescott      2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nocona        2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=core2         2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=penryn        2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bonnell       2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
 
 ; Intel chips with fast unaligned memory accesses
 
@@ -26,15 +26,15 @@
 
 ; AMD chips with slow unaligned memory accesses
 
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-4      2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-xp     2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=k8            2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=opteron       2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon64      2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-fx     2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=k8-sse3       2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=opteron-sse3  2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon64-sse3 2>&1 | FileCheck %s --check-prefixes=SLOW
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-4      2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SCALAR
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-xp     2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SCALAR
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=k8            2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=opteron       2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon64      2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-fx     2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=k8-sse3       2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=opteron-sse3  2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon64-sse3 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
 
 ; AMD chips with fast unaligned memory accesses
 
@@ -67,26 +67,40 @@
 ; SLOW-NOT: not a recognized processor
 ; FAST-NOT: not a recognized processor
 define void @store_zeros(ptr %a) {
-; SLOW-LABEL: store_zeros:
-; SLOW:       # %bb.0:
-; SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NEXT:    movl $0
-; SLOW-NOT:     movl
+; SLOW-SCALAR-LABEL: store_zeros:
+; SLOW-SCALAR:       # %bb.0:
+; SLOW-SCALAR-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NEXT:    movl $0
+; SLOW-SCALAR-NOT:     movl
+;
+; SLOW-SSE-LABEL: store_zeros:
+; SLOW-SSE:       # %bb.0:
+; SLOW-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SLOW-SSE-NEXT:    xorps %xmm0, %xmm0
+; SLOW-SSE-NEXT:    movsd %xmm0
+; SLOW-SSE-NEXT:    movsd %xmm0
+; SLOW-SSE-NEXT:    movsd %xmm0
+; SLOW-SSE-NEXT:    movsd %xmm0
+; SLOW-SSE-NEXT:    movsd %xmm0
+; SLOW-SSE-NEXT:    movsd %xmm0
+; SLOW-SSE-NEXT:    movsd %xmm0
+; SLOW-SSE-NEXT:    movsd %xmm0
+; SLOW-SSE-NOT:     movsd
 ;
 ; FAST-SSE-LABEL: store_zeros:
 ; FAST-SSE:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/zero-remat.ll b/llvm/test/CodeGen/X86/zero-remat.ll
index 60bb2c420cda4..000e0d14b711f 100644
--- a/llvm/test/CodeGen/X86/zero-remat.ll
+++ b/llvm/test/CodeGen/X86/zero-remat.ll
@@ -19,11 +19,12 @@ define double @foo() nounwind {
 ;
 ; CHECK-32-LABEL: foo:
 ; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    pushl $0
-; CHECK-32-NEXT:    pushl $0
+; CHECK-32-NEXT:    subl $8, %esp
+; CHECK-32-NEXT:    fldz
+; CHECK-32-NEXT:    fstpl (%esp)
 ; CHECK-32-NEXT:    calll bar at PLT
-; CHECK-32-NEXT:    addl $8, %esp
 ; CHECK-32-NEXT:    fldz
+; CHECK-32-NEXT:    addl $8, %esp
 ; CHECK-32-NEXT:    retl
   call void @bar(double 0.0)
   ret double 0.0



More information about the llvm-commits mailing list