[llvm] [X86][NVPTX][LegalizeDAG] If i16 legal, legalize FABS/FNEG/FCOPYSIGN (f16) with Expand (PR #106153)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 27 18:49:45 PDT 2024
https://github.com/v01dXYZ updated https://github.com/llvm/llvm-project/pull/106153
From 02f912a92087318eb940040bbdb737f9f6b18c34 Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Wed, 28 Aug 2024 00:45:57 +0200
Subject: [PATCH 1/2] [X86] test pre-commit: Legalise FNEG/FCOPYSIGN (f16) with
Expand
---
llvm/test/CodeGen/X86/fp16-libcalls.ll | 98 ++++++++++++++++++++++++++
1 file changed, 98 insertions(+)
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index db3d031a8fe3fb..bd1dc47efa41ff 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -175,6 +175,104 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
ret void
}
+define void @test_half_fneg(half %a0, ptr %p0) nounwind {
+; F16C-LABEL: test_half_fneg:
+; F16C: # %bb.0:
+; F16C-NEXT: vpextrw $0, %xmm0, %eax
+; F16C-NEXT: vmovd %eax, %xmm0
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: movw %ax, (%rdi)
+; F16C-NEXT: retq
+;
+; FP16-LABEL: test_half_fneg:
+; FP16: # %bb.0:
+; FP16-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; FP16-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; FP16-NEXT: vmovsh %xmm0, (%rdi)
+; FP16-NEXT: retq
+;
+; X64-LABEL: test_half_fneg:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbx
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: callq __extendhfsf2 at PLT
+; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: callq __truncsfhf2 at PLT
+; X64-NEXT: pextrw $0, %xmm0, %eax
+; X64-NEXT: movw %ax, (%rbx)
+; X64-NEXT: popq %rbx
+; X64-NEXT: retq
+;
+; X86-LABEL: test_half_fneg:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: calll __extendhfsf2
+; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: movd %xmm0, (%esp)
+; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: pextrw $0, %xmm0, %eax
+; X86-NEXT: movw %ax, (%esi)
+; X86-NEXT: addl $8, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+ %res = fneg half %a0
+ store half %res, ptr %p0, align 2
+ ret void
+}
+
+define void @test_half_fcopysign(half %a0, half %a1, ptr %p0) nounwind {
+; F16C-LABEL: test_half_fcopysign:
+; F16C: # %bb.0:
+; F16C-NEXT: vpextrw $0, %xmm1, %eax
+; F16C-NEXT: andl $32768, %eax # imm = 0x8000
+; F16C-NEXT: vpextrw $0, %xmm0, %ecx
+; F16C-NEXT: andl $32767, %ecx # imm = 0x7FFF
+; F16C-NEXT: orl %eax, %ecx
+; F16C-NEXT: movw %cx, (%rdi)
+; F16C-NEXT: retq
+;
+; FP16-LABEL: test_half_fcopysign:
+; FP16: # %bb.0:
+; FP16-NEXT: vpbroadcastw {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; FP16-NEXT: vpternlogd $202, %xmm1, %xmm0, %xmm2
+; FP16-NEXT: vmovsh %xmm2, (%rdi)
+; FP16-NEXT: retq
+;
+; X64-LABEL: test_half_fcopysign:
+; X64: # %bb.0:
+; X64-NEXT: pextrw $0, %xmm1, %eax
+; X64-NEXT: andl $32768, %eax # imm = 0x8000
+; X64-NEXT: pextrw $0, %xmm0, %ecx
+; X64-NEXT: andl $32767, %ecx # imm = 0x7FFF
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: movw %cx, (%rdi)
+; X64-NEXT: retq
+;
+; X86-LABEL: test_half_fcopysign:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $32768, %ecx # imm = 0x8000
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl $32767, %edx # imm = 0x7FFF
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movw %dx, (%eax)
+; X86-NEXT: retl
+ %res = call half @llvm.copysign.half(half %a0, half %a1)
+ store half %res, ptr %p0, align 2
+ ret void
+}
+
define void @test_half_pow(half %a0, half %a1, ptr %p0) nounwind {
; F16C-LABEL: test_half_pow:
; F16C: # %bb.0:
From 2c1a3e15104cef8afa2887aedf72e0d76d5915b1 Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Tue, 27 Aug 2024 00:02:14 +0200
Subject: [PATCH 2/2] [X86][NVPTX][LegalizeDAG] If i16 legal, legalize
FABS/FNEG/FCOPYSIGN (f16) with Expand
On the affected targets, `Expand` clears/flips/copies the sign bit
after bitcasting the f16 value to i16. This is cheaper than `Promote`,
which converts back and forth between f16 and f32 (possibly through
expensive libcalls) just to perform FABS/FNEG/FCOPYSIGN on f32.
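
Roughly, the expanded lowering computes the following (a plain C++
sketch of the bit manipulation, operating on the raw i16 bits rather
than the actual SelectionDAG nodes; the function names are illustrative
only, but the constants match the `andl $32767` / `xorl $32768` /
`orl` sequences in the updated X86 tests):

  #include <cstdint>

  // Half values are represented here by their raw IEEE-754 binary16 bits.
  uint16_t fabs_f16(uint16_t A) { return A & 0x7FFF; }  // clear sign bit
  uint16_t fneg_f16(uint16_t A) { return A ^ 0x8000; }  // flip sign bit
  uint16_t copysign_f16(uint16_t Mag, uint16_t Sgn) {
    return (Mag & 0x7FFF) | (Sgn & 0x8000);             // copy sign bit
  }

With `Promote`, each of these operations instead round-trips through
f32 (an `__extendhfsf2`/`__truncsfhf2` libcall pair on targets without
F16C), as the removed CHECK lines in the test diffs below show.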
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 3 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 7 +-
llvm/test/CodeGen/NVPTX/f16-instructions.ll | 12 ++--
llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 19 +++--
llvm/test/CodeGen/X86/fp16-libcalls.ll | 70 ++++---------------
llvm/test/CodeGen/X86/half.ll | 12 ++--
6 files changed, 39 insertions(+), 84 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 73791102fc04de..5b0a87d5f150f8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -849,7 +849,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
AddPromotedToType(Op, MVT::bf16, MVT::f32);
}
for (const auto &Op : {ISD::FABS}) {
- setOperationAction(Op, MVT::f16, Promote);
+ // Expand instead of Promote to clear sign bit by bitcasting to i16
+ setOperationAction(Op, MVT::f16, Expand);
setOperationAction(Op, MVT::f32, Legal);
setOperationAction(Op, MVT::f64, Legal);
setOperationAction(Op, MVT::v2f16, Expand);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1a6be4eb5af1ef..403ea93b62f6b3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -601,7 +601,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
setOperationAction(ISD::FABS, VT, Action);
setOperationAction(ISD::FNEG, VT, Action);
- setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Action);
setOperationAction(ISD::FREM, VT, Action);
setOperationAction(ISD::FMA, VT, Action);
setOperationAction(ISD::FMINNUM, VT, Action);
@@ -672,6 +672,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Half type will be promoted by default.
setF16Action(MVT::f16, Promote);
+ // Expand instead of Promote to clear/flip/copy sign bit by bitcasting to
+ // i16.
+ setOperationAction(ISD::FABS, MVT::f16, Expand);
+ setOperationAction(ISD::FNEG, MVT::f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
setOperationAction(ISD::FADD, MVT::f16, Promote);
setOperationAction(ISD::FSUB, MVT::f16, Promote);
setOperationAction(ISD::FMUL, MVT::f16, Promote);
diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index 14e02a49f6e5e4..cfabf1d6639c2a 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -981,14 +981,10 @@ define half @test_fma(half %a, half %b, half %c) #0 {
}
; CHECK-LABEL: test_fabs(
-; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fabs_param_0];
-; CHECK-NOFTZ: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOFTZ: abs.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-F16-FTZ: abs.ftz.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
+; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fabs_param_0];
+; CHECK: and.b16 [[RF:%rs[0-9]+]], [[A]], 32767;
+; CHECK: st.param.b16 [func_retval0+0], [[RF]];
+; CHECK: ret;
define half @test_fabs(half %a) #0 {
%r = call half @llvm.fabs.f16(half %a)
ret half %r
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index 464b3a754804fe..abbf9c2a139216 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -1182,18 +1182,15 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
ret <2 x half> %r
}
+; TODO: This should be optimised to use only one AND on the i32 register.
; CHECK-LABEL: test_fabs(
-; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fabs_param_0];
-; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
+; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fabs_param_0];
+; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK: and.b16 [[A2:%rs[0-9]+]], [[A1]], 32767;
+; CHECK: and.b16 [[A3:%rs[0-9]+]], [[A0]], 32767;
+; CHECK: mov.b32 [[B:%r[0-9]+]], {[[A3]], [[A2]]};
+; CHECK: st.param.b32 [func_retval0+0], [[B]];
+; CHECK: ret;
define <2 x half> @test_fabs(<2 x half> %a) #0 {
%r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
ret <2 x half> %r
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index bd1dc47efa41ff..c20e91f46a0a3d 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -124,11 +124,7 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
; F16C-LABEL: test_half_fabs:
; F16C: # %bb.0:
; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
-; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: andl $32767, %eax # imm = 0x7FFF
; F16C-NEXT: movw %ax, (%rdi)
; F16C-NEXT: retq
;
@@ -141,34 +137,17 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
;
; X64-LABEL: test_half_fabs:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: callq __extendhfsf2 at PLT
-; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT: callq __truncsfhf2 at PLT
; X64-NEXT: pextrw $0, %xmm0, %eax
-; X64-NEXT: movw %ax, (%rbx)
-; X64-NEXT: popq %rbx
+; X64-NEXT: andl $32767, %eax # imm = 0x7FFF
+; X64-NEXT: movw %ax, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: test_half_fabs:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esp)
-; X86-NEXT: calll __extendhfsf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: calll __truncsfhf2
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esi)
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT: movw %cx, (%eax)
; X86-NEXT: retl
%res = call half @llvm.fabs.half(half %a0)
store half %res, ptr %p0, align 2
@@ -179,11 +158,7 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind {
; F16C-LABEL: test_half_fneg:
; F16C: # %bb.0:
; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
-; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: xorl $32768, %eax # imm = 0x8000
; F16C-NEXT: movw %ax, (%rdi)
; F16C-NEXT: retq
;
@@ -196,34 +171,17 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind {
;
; X64-LABEL: test_half_fneg:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: callq __extendhfsf2 at PLT
-; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT: callq __truncsfhf2 at PLT
; X64-NEXT: pextrw $0, %xmm0, %eax
-; X64-NEXT: movw %ax, (%rbx)
-; X64-NEXT: popq %rbx
+; X64-NEXT: xorl $32768, %eax # imm = 0x8000
+; X64-NEXT: movw %ax, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: test_half_fneg:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esp)
-; X86-NEXT: calll __extendhfsf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: calll __truncsfhf2
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esi)
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $32768, %ecx # imm = 0x8000
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movw %cx, (%eax)
; X86-NEXT: retl
%res = fneg half %a0
store half %res, ptr %p0, align 2
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 9f01d07e6a6705..6838925240058e 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1059,7 +1059,6 @@ define void @main.158() #0 {
; CHECK-LIBCALL: # %bb.0: # %entry
; CHECK-LIBCALL-NEXT: pushq %rax
; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0
-; CHECK-LIBCALL-NEXT: callq __truncsfhf2 at PLT
; CHECK-LIBCALL-NEXT: callq __extendhfsf2 at PLT
; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm1 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-LIBCALL-NEXT: ucomiss %xmm0, %xmm1
@@ -1077,10 +1076,10 @@ define void @main.158() #0 {
; BWON-F16C-LABEL: main.158:
; BWON-F16C: # %bb.0: # %entry
; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; BWON-F16C-NEXT: vmovss {{.*#+}} xmm2 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; BWON-F16C-NEXT: vucomiss %xmm1, %xmm2
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovss {{.*#+}} xmm1 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; BWON-F16C-NEXT: vucomiss %xmm0, %xmm1
+; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
; BWON-F16C-NEXT: jae .LBB20_2
; BWON-F16C-NEXT: # %bb.1: # %entry
; BWON-F16C-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
@@ -1093,8 +1092,7 @@ define void @main.158() #0 {
; CHECK-I686-LABEL: main.158:
; CHECK-I686: # %bb.0: # %entry
; CHECK-I686-NEXT: subl $12, %esp
-; CHECK-I686-NEXT: movl $0, (%esp)
-; CHECK-I686-NEXT: calll __truncsfhf2
+; CHECK-I686-NEXT: pxor %xmm0, %xmm0
; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax
; CHECK-I686-NEXT: movw %ax, (%esp)
; CHECK-I686-NEXT: calll __extendhfsf2