[llvm] 8fb083d - [X86][FP16] Add constrained FP support for scalar emulation

Phoebe Wang via llvm-commits llvm-commits@lists.llvm.org
Fri Jul 8 05:33:52 PDT 2022


Author: Phoebe Wang
Date: 2022-07-08T20:33:42+08:00
New Revision: 8fb083d33e192240f4a7e692d79a3748e47b65e7

URL: https://github.com/llvm/llvm-project/commit/8fb083d33e192240f4a7e692d79a3748e47b65e7
DIFF: https://github.com/llvm/llvm-project/commit/8fb083d33e192240f4a7e692d79a3748e47b65e7.diff

LOG: [X86][FP16] Add constrained FP support for scalar emulation

This is a follow-up patch to support constrained FP in the FP16 scalar emulation.
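
For targets without AVX512FP16, a strict half operation is emulated by
promotion: the operands are extended to float (__extendhfsf2 libcalls on
plain SSE2, vcvtph2ps with F16C/AVX512F), the arithmetic is done in single
precision, and the result is truncated back to half (__truncsfhf2 or
vcvtps2ph). As a minimal sketch in the style of the updated tests, IR like
the following is now handled:

  ; Constrained add on half; mirrors fadd_f16 in fp-strict-scalar-fp16.ll.
  define half @fadd_f16(half %a, half %b) #0 {
    %ret = call half @llvm.experimental.constrained.fadd.f16(half %a, half %b,
               metadata !"round.dynamic",
               metadata !"fpexcept.strict") #0
    ret half %ret
  }
  declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata)
  attributes #0 = { strictfp nounwind }

With -mattr=+sse2 this becomes two __extendhfsf2 calls, an addss and a
__truncsfhf2 call, as the new SSE2 CHECK lines below show.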

Reviewed By: skan

Differential Revision: https://reviews.llvm.org/D128114

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll
    llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
    llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll
    llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
    llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e00d62798c01b..1c370c6417e1e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -629,6 +629,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall);
     setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall);
     setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+
+    setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, LibCall);
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, LibCall);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
 
     setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");

diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll
index dc70de2c57414..db5246d622713 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll
@@ -1,8 +1,50 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c -O3 | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK-32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK-64
 
 define i32 @test_f16_oeq_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_oeq_q:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovnel %ebx, %ebp
+; SSE2-NEXT:    cmovpl %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_oeq_q:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovnel %esi, %eax
+; AVX-NEXT:    cmovpl %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_oeq_q:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -29,6 +71,43 @@ define i32 @test_f16_oeq_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_ogt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ogt_q:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovbel %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_ogt_q:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovbel %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_ogt_q:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -53,6 +132,43 @@ define i32 @test_f16_ogt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_oge_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_oge_q:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovbl %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_oge_q:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovbl %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_oge_q:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -77,6 +193,45 @@ define i32 @test_f16_oge_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_olt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_olt_q:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    ucomiss %xmm0, %xmm1
+; SSE2-NEXT:    cmovbel %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_olt_q:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm0, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovbel %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_olt_q:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -101,6 +256,45 @@ define i32 @test_f16_olt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_ole_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ole_q:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    ucomiss %xmm0, %xmm1
+; SSE2-NEXT:    cmovbl %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_ole_q:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm0, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovbl %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_ole_q:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -125,6 +319,43 @@ define i32 @test_f16_ole_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_one_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_one_q:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovel %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_one_q:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovel %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_one_q:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -149,6 +380,43 @@ define i32 @test_f16_one_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_ord_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ord_q:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovpl %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_ord_q:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovpl %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_ord_q:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -173,6 +441,43 @@ define i32 @test_f16_ord_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_ueq_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ueq_q:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovnel %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_ueq_q:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovnel %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_ueq_q:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -197,6 +502,45 @@ define i32 @test_f16_ueq_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_ugt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ugt_q:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    ucomiss %xmm0, %xmm1
+; SSE2-NEXT:    cmovael %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_ugt_q:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm0, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovael %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_ugt_q:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -221,6 +565,45 @@ define i32 @test_f16_ugt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_uge_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_uge_q:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    ucomiss %xmm0, %xmm1
+; SSE2-NEXT:    cmoval %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_uge_q:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm0, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    cmoval %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_uge_q:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -245,6 +628,43 @@ define i32 @test_f16_uge_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_ult_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ult_q:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovael %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_ult_q:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovael %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_ult_q:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -269,6 +689,43 @@ define i32 @test_f16_ult_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_ule_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ule_q:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmoval %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_ule_q:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    cmoval %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_ule_q:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -293,6 +750,45 @@ define i32 @test_f16_ule_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_une_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_une_q:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovnel %ebp, %ebx
+; SSE2-NEXT:    cmovpl %ebp, %ebx
+; SSE2-NEXT:    movl %ebx, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_une_q:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %esi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovnel %edi, %eax
+; AVX-NEXT:    cmovpl %edi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_une_q:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -319,6 +815,43 @@ define i32 @test_f16_une_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_uno_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_uno_q:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovnpl %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_uno_q:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovnpl %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_uno_q:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -343,6 +876,45 @@ define i32 @test_f16_uno_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_oeq_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_oeq_s:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovnel %ebx, %ebp
+; SSE2-NEXT:    cmovpl %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_oeq_s:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vcomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovnel %esi, %eax
+; AVX-NEXT:    cmovpl %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_oeq_s:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -369,6 +941,43 @@ define i32 @test_f16_oeq_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_ogt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ogt_s:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovbel %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_ogt_s:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vcomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovbel %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_ogt_s:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -393,6 +1002,43 @@ define i32 @test_f16_ogt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_oge_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_oge_s:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovbl %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_oge_s:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vcomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovbl %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_oge_s:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -417,6 +1063,45 @@ define i32 @test_f16_oge_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_olt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_olt_s:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    comiss %xmm0, %xmm1
+; SSE2-NEXT:    cmovbel %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_olt_s:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm0, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vcomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovbel %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_olt_s:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -441,6 +1126,45 @@ define i32 @test_f16_olt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_ole_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ole_s:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    comiss %xmm0, %xmm1
+; SSE2-NEXT:    cmovbl %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_ole_s:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm0, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vcomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovbl %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_ole_s:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -465,6 +1189,43 @@ define i32 @test_f16_ole_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_one_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_one_s:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovel %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_one_s:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vcomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovel %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_one_s:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -489,6 +1250,43 @@ define i32 @test_f16_one_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_ord_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ord_s:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovpl %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_ord_s:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vcomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovpl %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_ord_s:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -513,6 +1311,43 @@ define i32 @test_f16_ord_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_ueq_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ueq_s:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovnel %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_ueq_s:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vcomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovnel %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_ueq_s:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -537,6 +1372,45 @@ define i32 @test_f16_ueq_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_ugt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ugt_s:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    comiss %xmm0, %xmm1
+; SSE2-NEXT:    cmovael %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_ugt_s:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm0, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vcomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovael %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_ugt_s:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -561,6 +1435,45 @@ define i32 @test_f16_ugt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_uge_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_uge_s:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    comiss %xmm0, %xmm1
+; SSE2-NEXT:    cmoval %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_uge_s:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm0, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vcomiss %xmm0, %xmm1
+; AVX-NEXT:    cmoval %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_uge_s:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -585,6 +1498,43 @@ define i32 @test_f16_uge_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_ult_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ult_s:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovael %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_ult_s:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vcomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovael %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_ult_s:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -609,6 +1559,43 @@ define i32 @test_f16_ult_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_ule_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ule_s:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmoval %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_ule_s:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vcomiss %xmm0, %xmm1
+; AVX-NEXT:    cmoval %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_ule_s:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -633,6 +1620,45 @@ define i32 @test_f16_ule_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_une_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_une_s:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovnel %ebp, %ebx
+; SSE2-NEXT:    cmovpl %ebp, %ebx
+; SSE2-NEXT:    movl %ebx, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_une_s:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %esi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vcomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovnel %edi, %eax
+; AVX-NEXT:    cmovpl %edi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_une_s:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -659,6 +1685,43 @@ define i32 @test_f16_une_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define i32 @test_f16_uno_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_uno_s:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movl %esi, %ebx
+; SSE2-NEXT:    movl %edi, %ebp
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    cmovnpl %ebx, %ebp
+; SSE2-NEXT:    movl %ebp, %eax
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_f16_uno_s:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX-NEXT:    vpextrw $0, %xmm1, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vcomiss %xmm0, %xmm1
+; AVX-NEXT:    cmovnpl %esi, %eax
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: test_f16_uno_s:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -683,6 +1746,42 @@ define i32 @test_f16_uno_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 }
 
 define void @foo(half %0, half %1) #0 {
+; SSE2-LABEL: foo:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    jbe .LBB28_1
+; SSE2-NEXT:  # %bb.2:
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    jmp bar@PLT # TAILCALL
+; SSE2-NEXT:  .LBB28_1:
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: foo:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vucomiss %xmm0, %xmm1
+; AVX-NEXT:    jbe .LBB28_1
+; AVX-NEXT:  # %bb.2:
+; AVX-NEXT:    jmp bar@PLT # TAILCALL
+; AVX-NEXT:  .LBB28_1:
+; AVX-NEXT:    retq
+;
 ; CHECK-32-LABEL: foo:
 ; CHECK-32:       # %bb.0:
 ; CHECK-32-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -713,7 +1812,7 @@ define void @foo(half %0, half %1) #0 {
 }
 declare void @bar()
 
-attributes #0 = { strictfp }
+attributes #0 = { strictfp nounwind }
 
 declare i1 @llvm.experimental.constrained.fcmp.f16(half, half, metadata, metadata)
 declare i1 @llvm.experimental.constrained.fcmps.f16(half, half, metadata, metadata)

diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
index a1b6d09d85165..c09af463c9cb5 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
@@ -1,4 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c -O3 | FileCheck %s --check-prefixes=AVX,F16C
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX,AVX512
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64
 
@@ -14,6 +17,39 @@ declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata)
 declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata)
 
 define half @fadd_f16(half %a, half %b) nounwind strictfp {
+; SSE2-LABEL: fadd_f16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fadd_f16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fadd_f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -31,6 +67,39 @@ define half @fadd_f16(half %a, half %b) nounwind strictfp {
 }
 
 define half @fsub_f16(half %a, half %b) nounwind strictfp {
+; SSE2-LABEL: fsub_f16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fsub_f16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fsub_f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -48,6 +117,39 @@ define half @fsub_f16(half %a, half %b) nounwind strictfp {
 }
 
 define half @fmul_f16(half %a, half %b) nounwind strictfp {
+; SSE2-LABEL: fmul_f16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    mulss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fmul_f16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fmul_f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -65,6 +167,39 @@ define half @fmul_f16(half %a, half %b) nounwind strictfp {
 }
 
 define half @fdiv_f16(half %a, half %b) nounwind strictfp {
+; SSE2-LABEL: fdiv_f16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    divss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fdiv_f16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    vpextrw $0, %xmm1, %ecx
+; AVX-NEXT:    movzwl %cx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm1
+; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fdiv_f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -82,6 +217,24 @@ define half @fdiv_f16(half %a, half %b) nounwind strictfp {
 }
 
 define void @fpext_f16_to_f32(ptr %val, ptr %ret) nounwind strictfp {
+; SSE2-LABEL: fpext_f16_to_f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    movq %rsi, %rbx
+; SSE2-NEXT:    pinsrw $0, (%rdi), %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movd %xmm0, (%rbx)
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fpext_f16_to_f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzwl (%rdi), %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vmovss %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fpext_f16_to_f32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -105,6 +258,26 @@ define void @fpext_f16_to_f32(ptr %val, ptr %ret) nounwind strictfp {
 }
 
 define void @fpext_f16_to_f64(ptr %val, ptr %ret) nounwind strictfp {
+; SSE2-LABEL: fpext_f16_to_f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    movq %rsi, %rbx
+; SSE2-NEXT:    pinsrw $0, (%rdi), %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    cvtss2sd %xmm0, %xmm0
+; SSE2-NEXT:    movsd %xmm0, (%rbx)
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fpext_f16_to_f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzwl (%rdi), %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovsd %xmm0, (%rsi)
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fpext_f16_to_f64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -128,6 +301,25 @@ define void @fpext_f16_to_f64(ptr %val, ptr %ret) nounwind strictfp {
 }
 
 define void @fptrunc_float_to_f16(ptr %val, ptr%ret) nounwind strictfp {
+; SSE2-LABEL: fptrunc_float_to_f16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    movq %rsi, %rbx
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    movw %ax, (%rbx)
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fptrunc_float_to_f16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    movw %ax, (%rsi)
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fptrunc_float_to_f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -152,6 +344,28 @@ define void @fptrunc_float_to_f16(ptr %val, ptr%ret) nounwind strictfp {
 }
 
 define void @fptrunc_double_to_f16(ptr %val, ptr%ret) nounwind strictfp {
+; SSE2-LABEL: fptrunc_double_to_f16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    movq %rsi, %rbx
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    callq __truncdfhf2@PLT
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    movw %ax, (%rbx)
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fptrunc_double_to_f16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    movw %ax, (%rsi)
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fptrunc_double_to_f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -176,6 +390,32 @@ define void @fptrunc_double_to_f16(ptr %val, ptr%ret) nounwind strictfp {
 }
 
 define void @fsqrt_f16(ptr %a) nounwind strictfp {
+; SSE2-LABEL: fsqrt_f16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    movq %rdi, %rbx
+; SSE2-NEXT:    pinsrw $0, (%rdi), %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    sqrtss %xmm0, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    movw %ax, (%rbx)
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fsqrt_f16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzwl (%rdi), %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    movw %ax, (%rdi)
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fsqrt_f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -199,6 +439,76 @@ define void @fsqrt_f16(ptr %a) nounwind strictfp {
 }
 
 define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
+; SSE2-LABEL: fma_f16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    subq $24, %rsp
+; SSE2-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
+; SSE2-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    callq fmaf@PLT
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    addq $24, %rsp
+; SSE2-NEXT:    retq
+;
+; F16C-LABEL: fma_f16:
+; F16C:       # %bb.0:
+; F16C-NEXT:    pushq %rax
+; F16C-NEXT:    vpextrw $0, %xmm0, %eax
+; F16C-NEXT:    vpextrw $0, %xmm1, %ecx
+; F16C-NEXT:    vpextrw $0, %xmm2, %edx
+; F16C-NEXT:    movzwl %dx, %edx
+; F16C-NEXT:    vmovd %edx, %xmm0
+; F16C-NEXT:    vcvtph2ps %xmm0, %xmm2
+; F16C-NEXT:    movzwl %cx, %ecx
+; F16C-NEXT:    vmovd %ecx, %xmm0
+; F16C-NEXT:    vcvtph2ps %xmm0, %xmm1
+; F16C-NEXT:    movzwl %ax, %eax
+; F16C-NEXT:    vmovd %eax, %xmm0
+; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT:    callq fmaf@PLT
+; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    vmovd %xmm0, %eax
+; F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; F16C-NEXT:    popq %rax
+; F16C-NEXT:    retq
+;
+; AVX512-LABEL: fma_f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpextrw $0, %xmm1, %eax
+; AVX512-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX512-NEXT:    vpextrw $0, %xmm2, %edx
+; AVX512-NEXT:    movzwl %dx, %edx
+; AVX512-NEXT:    vmovd %edx, %xmm0
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    movzwl %cx, %ecx
+; AVX512-NEXT:    vmovd %ecx, %xmm1
+; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT:    movzwl %ax, %eax
+; AVX512-NEXT:    vmovd %eax, %xmm2
+; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm0
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
 ; X86-LABEL: fma_f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm1

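The new SSE2 and AVX check lines above show the two emulation strategies for scalar f16 arithmetic without AVX512FP16: SSE2 promotes each operand to f32 through the __extendhfsf2 libcall, performs the operation in f32, and truncates back via __truncsfhf2, while F16C/AVX512F targets keep the conversions inline with vcvtph2ps/vcvtps2ph. A minimal sketch of the strict-FP IR shape these checks exercise (illustrative only; fsub_f16 here mirrors the test of the same name, and the metadata operands follow the usual constrained-intrinsic convention):

define half @fsub_f16(half %a, half %b) nounwind strictfp {
  ; Constrained fsub: last two operands are the rounding mode and
  ; exception behavior, which keep the operation strict.
  %sub = call half @llvm.experimental.constrained.fsub.f16(
             half %a, half %b,
             metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp
  ret half %sub
}

declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata)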
diff  --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll
index 9ea19ca318816..fac14d8f14e8a 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll
@@ -1,4 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2  -O3 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c  -O3 | FileCheck %s --check-prefixes=AVX,F16C
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f  -O3 | FileCheck %s --check-prefixes=AVX,AVX512
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64
 
@@ -14,6 +17,25 @@ declare i32 @llvm.experimental.constrained.fptoui.i32.f16(half, metadata)
 declare i64 @llvm.experimental.constrained.fptoui.i64.f16(half, metadata)
 
 define i1 @fptosi_f16toi1(half %x) #0 {
+; SSE2-LABEL: fptosi_f16toi1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    cvttss2si %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fptosi_f16toi1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vcvttss2si %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fptosi_f16toi1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vcvttsh2si {{[0-9]+}}(%esp), %eax
@@ -31,6 +53,25 @@ define i1 @fptosi_f16toi1(half %x) #0 {
 }
 
 define i8 @fptosi_f16toi8(half %x) #0 {
+; SSE2-LABEL: fptosi_f16toi8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    cvttss2si %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fptosi_f16toi8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vcvttss2si %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fptosi_f16toi8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vcvttsh2si {{[0-9]+}}(%esp), %eax
@@ -48,6 +89,25 @@ define i8 @fptosi_f16toi8(half %x) #0 {
 }
 
 define i16 @fptosi_f16toi16(half %x) #0 {
+; SSE2-LABEL: fptosi_f16toi16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    cvttss2si %xmm0, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fptosi_f16toi16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vcvttss2si %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fptosi_f16toi16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vcvttsh2si {{[0-9]+}}(%esp), %eax
@@ -65,6 +125,23 @@ define i16 @fptosi_f16toi16(half %x) #0 {
 }
 
 define i32 @fptosi_f16toi32(half %x) #0 {
+; SSE2-LABEL: fptosi_f16toi32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    cvttss2si %xmm0, %eax
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fptosi_f16toi32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vcvttss2si %xmm0, %eax
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fptosi_f16toi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vcvttsh2si {{[0-9]+}}(%esp), %eax
@@ -80,6 +157,23 @@ define i32 @fptosi_f16toi32(half %x) #0 {
 }
 
 define i64 @fptosi_f16toi64(half %x) #0 {
+; SSE2-LABEL: fptosi_f16toi64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    cvttss2si %xmm0, %rax
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fptosi_f16toi64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vcvttss2si %xmm0, %rax
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fptosi_f16toi64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -98,6 +192,25 @@ define i64 @fptosi_f16toi64(half %x) #0 {
 }
 
 define i1 @fptoui_f16toi1(half %x) #0 {
+; SSE2-LABEL: fptoui_f16toi1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    cvttss2si %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fptoui_f16toi1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vcvttss2si %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fptoui_f16toi1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vcvttsh2si {{[0-9]+}}(%esp), %eax
@@ -115,6 +228,25 @@ define i1 @fptoui_f16toi1(half %x) #0 {
 }
 
 define i8 @fptoui_f16toi8(half %x) #0 {
+; SSE2-LABEL: fptoui_f16toi8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    cvttss2si %xmm0, %eax
+; SSE2-NEXT:    # kill: def $al killed $al killed $eax
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fptoui_f16toi8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vcvttss2si %xmm0, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fptoui_f16toi8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vcvttsh2si {{[0-9]+}}(%esp), %eax
@@ -132,6 +264,25 @@ define i8 @fptoui_f16toi8(half %x) #0 {
 }
 
 define i16 @fptoui_f16toi16(half %x) #0 {
+; SSE2-LABEL: fptoui_f16toi16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    cvttss2si %xmm0, %eax
+; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fptoui_f16toi16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vcvttss2si %xmm0, %eax
+; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fptoui_f16toi16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vcvttsh2si {{[0-9]+}}(%esp), %eax
@@ -149,6 +300,34 @@ define i16 @fptoui_f16toi16(half %x) #0 {
 }
 
 define i32 @fptoui_f16toi32(half %x) #0 {
+; SSE2-LABEL: fptoui_f16toi32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    cvttss2si %xmm0, %rax
+; SSE2-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; F16C-LABEL: fptoui_f16toi32:
+; F16C:       # %bb.0:
+; F16C-NEXT:    vpextrw $0, %xmm0, %eax
+; F16C-NEXT:    movzwl %ax, %eax
+; F16C-NEXT:    vmovd %eax, %xmm0
+; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT:    vcvttss2si %xmm0, %rax
+; F16C-NEXT:    # kill: def $eax killed $eax killed $rax
+; F16C-NEXT:    retq
+;
+; AVX512-LABEL: fptoui_f16toi32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX512-NEXT:    movzwl %ax, %eax
+; AVX512-NEXT:    vmovd %eax, %xmm0
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vcvttss2usi %xmm0, %eax
+; AVX512-NEXT:    retq
+;
 ; X86-LABEL: fptoui_f16toi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vcvttsh2usi {{[0-9]+}}(%esp), %eax
@@ -164,6 +343,56 @@ define i32 @fptoui_f16toi32(half %x) #0 {
 }
 
 define i64 @fptoui_f16toi64(half %x) #0 {
+; SSE2-LABEL: fptoui_f16toi64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    comiss %xmm2, %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    jb .LBB9_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    movaps %xmm2, %xmm1
+; SSE2-NEXT:  .LBB9_2:
+; SSE2-NEXT:    subss %xmm1, %xmm0
+; SSE2-NEXT:    cvttss2si %xmm0, %rcx
+; SSE2-NEXT:    setae %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    shlq $63, %rax
+; SSE2-NEXT:    xorq %rcx, %rax
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; F16C-LABEL: fptoui_f16toi64:
+; F16C:       # %bb.0:
+; F16C-NEXT:    vpextrw $0, %xmm0, %eax
+; F16C-NEXT:    movzwl %ax, %eax
+; F16C-NEXT:    vmovd %eax, %xmm0
+; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; F16C-NEXT:    vcomiss %xmm1, %xmm0
+; F16C-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; F16C-NEXT:    jb .LBB9_2
+; F16C-NEXT:  # %bb.1:
+; F16C-NEXT:    vmovaps %xmm1, %xmm2
+; F16C-NEXT:  .LBB9_2:
+; F16C-NEXT:    vsubss %xmm2, %xmm0, %xmm0
+; F16C-NEXT:    vcvttss2si %xmm0, %rcx
+; F16C-NEXT:    setae %al
+; F16C-NEXT:    movzbl %al, %eax
+; F16C-NEXT:    shlq $63, %rax
+; F16C-NEXT:    xorq %rcx, %rax
+; F16C-NEXT:    retq
+;
+; AVX512-LABEL: fptoui_f16toi64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX512-NEXT:    movzwl %ax, %eax
+; AVX512-NEXT:    vmovd %eax, %xmm0
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512-NEXT:    retq
+;
 ; X86-LABEL: fptoui_f16toi64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -181,4 +410,4 @@ define i64 @fptoui_f16toi64(half %x) #0 {
   ret i64 %result
 }
 
-attributes #0 = { strictfp }
+attributes #0 = { strictfp nounwind }

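The fptoint checks above exercise the constrained FP-to-integer intrinsics declared near the top of that file. These take a single metadata operand, since truncation toward zero does not depend on the dynamic rounding mode; a representative call (illustrative only, mirroring the fptoui_f16toi32 test) looks like:

define i32 @fptoui_f16toi32(half %x) strictfp {
  ; Only the exception-behavior operand is needed for fptoui/fptosi.
  %res = call i32 @llvm.experimental.constrained.fptoui.i32.f16(
             half %x, metadata !"fpexcept.strict") strictfp
  ret i32 %res
}

declare i32 @llvm.experimental.constrained.fptoui.i32.f16(half, metadata)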
diff  --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
index 58b6068ea53ac..7617aee5e5fbf 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
@@ -1,4 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2  -O3 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c  -O3 | FileCheck %s --check-prefixes=AVX,F16C
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f  -O3 | FileCheck %s --check-prefixes=AVX,AVX512
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64
 
@@ -14,6 +17,30 @@ declare half @llvm.experimental.constrained.uitofp.f16.i32(i32, metadata, metada
 declare half @llvm.experimental.constrained.uitofp.f16.i64(i64, metadata, metadata)
 
 define half @sitofp_i1tof16(i1 %x) #0 {
+; SSE2-LABEL: sitofp_i1tof16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    andb $1, %dil
+; SSE2-NEXT:    negb %dil
+; SSE2-NEXT:    movsbl %dil, %eax
+; SSE2-NEXT:    cvtsi2ss %eax, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: sitofp_i1tof16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    andb $1, %dil
+; AVX-NEXT:    negb %dil
+; AVX-NEXT:    movsbl %dil, %eax
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: sitofp_i1tof16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
@@ -37,6 +64,26 @@ define half @sitofp_i1tof16(i1 %x) #0 {
 }
 
 define half @sitofp_i8tof16(i8 %x) #0 {
+; SSE2-LABEL: sitofp_i8tof16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movsbl %dil, %eax
+; SSE2-NEXT:    cvtsi2ss %eax, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: sitofp_i8tof16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movsbl %dil, %eax
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: sitofp_i8tof16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
@@ -55,6 +102,26 @@ define half @sitofp_i8tof16(i8 %x) #0 {
 }
 
 define half @sitofp_i16tof16(i16 %x) #0 {
+; SSE2-LABEL: sitofp_i16tof16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movswl %di, %eax
+; SSE2-NEXT:    cvtsi2ss %eax, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: sitofp_i16tof16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movswl %di, %eax
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: sitofp_i16tof16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
@@ -73,6 +140,24 @@ define half @sitofp_i16tof16(i16 %x) #0 {
 }
 
 define half @sitofp_i32tof16(i32 %x) #0 {
+; SSE2-LABEL: sitofp_i32tof16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    cvtsi2ss %edi, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: sitofp_i32tof16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: sitofp_i32tof16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vcvtsi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -89,6 +174,24 @@ define half @sitofp_i32tof16(i32 %x) #0 {
 }
 
 define half @sitofp_i64tof16(i64 %x) #0 {
+; SSE2-LABEL: sitofp_i64tof16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    cvtsi2ss %rdi, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: sitofp_i64tof16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: sitofp_i64tof16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
@@ -106,6 +209,26 @@ define half @sitofp_i64tof16(i64 %x) #0 {
 }
 
 define half @uitofp_i1tof16(i1 %x) #0 {
+; SSE2-LABEL: uitofp_i1tof16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    andl $1, %edi
+; SSE2-NEXT:    cvtsi2ss %edi, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: uitofp_i1tof16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    andl $1, %edi
+; AVX-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: uitofp_i1tof16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
@@ -126,6 +249,26 @@ define half @uitofp_i1tof16(i1 %x) #0 {
 }
 
 define half @uitofp_i8tof16(i8 %x) #0 {
+; SSE2-LABEL: uitofp_i8tof16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movzbl %dil, %eax
+; SSE2-NEXT:    cvtsi2ss %eax, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: uitofp_i8tof16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzbl %dil, %eax
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: uitofp_i8tof16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -144,6 +287,26 @@ define half @uitofp_i8tof16(i8 %x) #0 {
 }
 
 define half @uitofp_i16tof16(i16 %x) #0 {
+; SSE2-LABEL: uitofp_i16tof16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movzwl %di, %eax
+; SSE2-NEXT:    cvtsi2ss %eax, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: uitofp_i16tof16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzwl %di, %eax
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: uitofp_i16tof16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -162,6 +325,36 @@ define half @uitofp_i16tof16(i16 %x) #0 {
 }
 
 define half @uitofp_i32tof16(i32 %x) #0 {
+; SSE2-LABEL: uitofp_i32tof16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movl %edi, %eax
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; F16C-LABEL: uitofp_i32tof16:
+; F16C:       # %bb.0:
+; F16C-NEXT:    movl %edi, %eax
+; F16C-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    vmovd %xmm0, %eax
+; F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; F16C-NEXT:    retq
+;
+; AVX512-LABEL: uitofp_i32tof16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
 ; X86-LABEL: uitofp_i32tof16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vcvtusi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -178,6 +371,56 @@ define half @uitofp_i32tof16(i32 %x) #0 {
 }
 
 define half @uitofp_i64tof16(i64 %x) #0 {
+; SSE2-LABEL: uitofp_i64tof16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    shrq %rax
+; SSE2-NEXT:    movl %edi, %ecx
+; SSE2-NEXT:    andl $1, %ecx
+; SSE2-NEXT:    orq %rax, %rcx
+; SSE2-NEXT:    testq %rdi, %rdi
+; SSE2-NEXT:    cmovnsq %rdi, %rcx
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm0
+; SSE2-NEXT:    jns .LBB9_2
+; SSE2-NEXT:  # %bb.1:
+; SSE2-NEXT:    addss %xmm0, %xmm0
+; SSE2-NEXT:  .LBB9_2:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; F16C-LABEL: uitofp_i64tof16:
+; F16C:       # %bb.0:
+; F16C-NEXT:    movq %rdi, %rax
+; F16C-NEXT:    shrq %rax
+; F16C-NEXT:    movl %edi, %ecx
+; F16C-NEXT:    andl $1, %ecx
+; F16C-NEXT:    orq %rax, %rcx
+; F16C-NEXT:    testq %rdi, %rdi
+; F16C-NEXT:    cmovnsq %rdi, %rcx
+; F16C-NEXT:    vcvtsi2ss %rcx, %xmm0, %xmm0
+; F16C-NEXT:    jns .LBB9_2
+; F16C-NEXT:  # %bb.1:
+; F16C-NEXT:    vaddss %xmm0, %xmm0, %xmm0
+; F16C-NEXT:  .LBB9_2:
+; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    vmovd %xmm0, %eax
+; F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; F16C-NEXT:    retq
+;
+; AVX512-LABEL: uitofp_i64tof16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtusi2ss %rdi, %xmm0, %xmm0
+; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
 ; X86-LABEL: uitofp_i64tof16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
@@ -194,4 +437,4 @@ define half @uitofp_i64tof16(i64 %x) #0 {
   ret half %result
 }
 
-attributes #0 = { strictfp }
+attributes #0 = { strictfp nounwind }

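The inttofp tests above use the two-metadata form, since an integer-to-FP conversion does depend on the rounding mode; a representative call (illustrative only, mirroring the uitofp_i32tof16 test) is:

define half @uitofp_i32tof16(i32 %x) strictfp {
  ; Rounding-mode and exception-behavior operands are both required here.
  %res = call half @llvm.experimental.constrained.uitofp.f16.i32(
             i32 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp
  ret half %res
}

declare half @llvm.experimental.constrained.uitofp.f16.i32(i32, metadata, metadata)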
diff  --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
index 5d5ebcb278628..3b9798a2af582 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
@@ -1,4 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2  -O3 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c  -O3 | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f  -O3 | FileCheck %s --check-prefixes=AVX
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -O3 | FileCheck %s --check-prefixes=X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -O3 | FileCheck %s --check-prefixes=X64
 
@@ -11,6 +14,29 @@ declare half @llvm.experimental.constrained.roundeven.f16(half, metadata)
 declare half @llvm.experimental.constrained.round.f16(half, metadata)
 
 define half @fceil32(half %f) #0 {
+; SSE2-LABEL: fceil32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    callq ceilf@PLT
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fceil32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fceil32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vrndscalesh $10, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -26,6 +52,29 @@ define half @fceil32(half %f) #0 {
 }
 
 define half @ffloor32(half %f) #0 {
+; SSE2-LABEL: ffloor32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    callq floorf@PLT
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: ffloor32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: ffloor32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vrndscalesh $9, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -41,6 +90,29 @@ define half @ffloor32(half %f) #0 {
 }
 
 define half @ftrunc32(half %f) #0 {
+; SSE2-LABEL: ftrunc32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    callq truncf@PLT
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: ftrunc32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: ftrunc32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vrndscalesh $11, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -56,6 +128,29 @@ define half @ftrunc32(half %f) #0 {
 }
 
 define half @frint32(half %f) #0 {
+; SSE2-LABEL: frint32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    callq rintf@PLT
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: frint32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vroundss $4, %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: frint32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vrndscalesh $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -72,6 +167,29 @@ define half @frint32(half %f) #0 {
 }
 
 define half @fnearbyint32(half %f) #0 {
+; SSE2-LABEL: fnearbyint32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    callq nearbyintf@PLT
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fnearbyint32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vroundss $12, %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fnearbyint32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vrndscalesh $12, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -88,6 +206,29 @@ define half @fnearbyint32(half %f) #0 {
 }
 
 define half @froundeven16(half %f) #0 {
+; SSE2-LABEL: froundeven16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    callq roundevenf@PLT
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: froundeven16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    vroundss $8, %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: froundeven16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vrndscalesh $8, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -104,10 +245,34 @@ define half @froundeven16(half %f) #0 {
 }
 
 define half @fround16(half %f) #0 {
+; SSE2-LABEL: fround16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    callq roundf@PLT
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    popq %rax
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: fround16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rax
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    movzwl %ax, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT:    callq roundf@PLT
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT:    popq %rax
+; AVX-NEXT:    retq
+;
 ; X86-LABEL: fround16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
@@ -117,18 +282,15 @@ define half @fround16(half %f) #0 {
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fround16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
-; X64-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    callq roundf at PLT
 ; X64-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    popq %rax
-; X64-NEXT:    .cfi_def_cfa_offset 8
 ; X64-NEXT:    retq
 
   %res = call half @llvm.experimental.constrained.round.f16(
@@ -136,4 +298,4 @@ define half @fround16(half %f) #0 {
   ret half %res
 }
 
-attributes #0 = { strictfp }
+attributes #0 = { strictfp nounwind }

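For the rounding tests above, each check block corresponds to one of the constrained rounding intrinsics declared at the top of that file; a representative call (illustrative only, mirroring the fceil32 test) is:

define half @fceil32(half %f) strictfp {
  ; ceil/floor/trunc/round/roundeven take only the exception operand,
  ; since their rounding direction is fixed by the operation itself.
  %res = call half @llvm.experimental.constrained.ceil.f16(
             half %f, metadata !"fpexcept.strict") strictfp
  ret half %res
}

declare half @llvm.experimental.constrained.ceil.f16(half, metadata)

On SSE2 each of these becomes an extend/libm/trunc libcall sequence, while with F16C or AVX512F most map to vroundss with the matching immediate; round is the exception, as it has no vroundss encoding and stays a roundf libcall even on AVX, visible in the fround16 checks above.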

More information about the llvm-commits mailing list