[llvm] 8fb083d - [X86][FP16] Add constrained FP support for scalar emulation
Phoebe Wang via llvm-commits
llvm-commits@lists.llvm.org
Fri Jul 8 05:33:52 PDT 2022
Author: Phoebe Wang
Date: 2022-07-08T20:33:42+08:00
New Revision: 8fb083d33e192240f4a7e692d79a3748e47b65e7
URL: https://github.com/llvm/llvm-project/commit/8fb083d33e192240f4a7e692d79a3748e47b65e7
DIFF: https://github.com/llvm/llvm-project/commit/8fb083d33e192240f4a7e692d79a3748e47b65e7.diff
LOG: [X86][FP16] Add constrained FP support for scalar emulation
This is a follow-up patch to support constrained FP in FP16 emulation.
Reviewed By: skan
Differential Revision: https://reviews.llvm.org/D128114
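Here, "constrained FP" means the operations come in as llvm.experimental.constrained.* intrinsics, which carry explicit rounding-mode and FP-exception metadata that must be preserved during legalization. A minimal sketch of the kind of IR this patch handles (it mirrors the fadd_f16 test below; the snippet is illustrative, not taken verbatim from the tests):

  define half @fadd_f16(half %a, half %b) strictfp {
    %r = call half @llvm.experimental.constrained.fadd.f16(
             half %a, half %b,
             metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp
    ret half %r
  }
  declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata)

Without native FP16 support (AVX512FP16), the f16 operands are extended to f32 (__extendhfsf2 or vcvtph2ps), the strict operation is performed in f32, and the result is truncated back (__truncsfhf2 or vcvtps2ph), as the updated checks below show.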
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll
llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll
llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e00d62798c01b..1c370c6417e1e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -629,6 +629,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+
+ setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, LibCall);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, LibCall);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll
index dc70de2c57414..db5246d622713 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll
@@ -1,8 +1,50 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c -O3 | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK-64
define i32 @test_f16_oeq_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_oeq_q:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovnel %ebx, %ebp
+; SSE2-NEXT: cmovpl %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_oeq_q:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: cmovnel %esi, %eax
+; AVX-NEXT: cmovpl %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_oeq_q:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -29,6 +71,43 @@ define i32 @test_f16_oeq_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_ogt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ogt_q:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovbel %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_ogt_q:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: cmovbel %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_ogt_q:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -53,6 +132,43 @@ define i32 @test_f16_ogt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_oge_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_oge_q:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovbl %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_oge_q:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: cmovbl %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_oge_q:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -77,6 +193,45 @@ define i32 @test_f16_oge_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_olt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_olt_q:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: ucomiss %xmm0, %xmm1
+; SSE2-NEXT: cmovbel %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_olt_q:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm1, %ecx
+; AVX-NEXT: vpextrw $0, %xmm0, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: cmovbel %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_olt_q:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -101,6 +256,45 @@ define i32 @test_f16_olt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_ole_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ole_q:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: ucomiss %xmm0, %xmm1
+; SSE2-NEXT: cmovbl %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_ole_q:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm1, %ecx
+; AVX-NEXT: vpextrw $0, %xmm0, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: cmovbl %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_ole_q:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -125,6 +319,43 @@ define i32 @test_f16_ole_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_one_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_one_q:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovel %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_one_q:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: cmovel %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_one_q:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -149,6 +380,43 @@ define i32 @test_f16_one_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_ord_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ord_q:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovpl %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_ord_q:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: cmovpl %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_ord_q:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -173,6 +441,43 @@ define i32 @test_f16_ord_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_ueq_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ueq_q:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovnel %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_ueq_q:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: cmovnel %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_ueq_q:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -197,6 +502,45 @@ define i32 @test_f16_ueq_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_ugt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ugt_q:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: ucomiss %xmm0, %xmm1
+; SSE2-NEXT: cmovael %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_ugt_q:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm1, %ecx
+; AVX-NEXT: vpextrw $0, %xmm0, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: cmovael %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_ugt_q:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -221,6 +565,45 @@ define i32 @test_f16_ugt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_uge_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_uge_q:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: ucomiss %xmm0, %xmm1
+; SSE2-NEXT: cmoval %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_uge_q:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm1, %ecx
+; AVX-NEXT: vpextrw $0, %xmm0, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: cmoval %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_uge_q:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -245,6 +628,43 @@ define i32 @test_f16_uge_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_ult_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ult_q:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovael %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_ult_q:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: cmovael %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_ult_q:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -269,6 +689,43 @@ define i32 @test_f16_ult_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_ule_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ule_q:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmoval %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_ule_q:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: cmoval %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_ule_q:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -293,6 +750,45 @@ define i32 @test_f16_ule_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_une_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_une_q:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovnel %ebp, %ebx
+; SSE2-NEXT: cmovpl %ebp, %ebx
+; SSE2-NEXT: movl %ebx, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_une_q:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: cmovnel %edi, %eax
+; AVX-NEXT: cmovpl %edi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_une_q:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -319,6 +815,43 @@ define i32 @test_f16_une_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_uno_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_uno_q:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovnpl %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_uno_q:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: cmovnpl %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_uno_q:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -343,6 +876,45 @@ define i32 @test_f16_uno_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_oeq_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_oeq_s:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovnel %ebx, %ebp
+; SSE2-NEXT: cmovpl %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_oeq_s:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: cmovnel %esi, %eax
+; AVX-NEXT: cmovpl %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_oeq_s:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -369,6 +941,43 @@ define i32 @test_f16_oeq_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_ogt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ogt_s:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovbel %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_ogt_s:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: cmovbel %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_ogt_s:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -393,6 +1002,43 @@ define i32 @test_f16_ogt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_oge_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_oge_s:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovbl %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_oge_s:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: cmovbl %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_oge_s:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -417,6 +1063,45 @@ define i32 @test_f16_oge_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_olt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_olt_s:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: comiss %xmm0, %xmm1
+; SSE2-NEXT: cmovbel %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_olt_s:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm1, %ecx
+; AVX-NEXT: vpextrw $0, %xmm0, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: cmovbel %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_olt_s:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -441,6 +1126,45 @@ define i32 @test_f16_olt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_ole_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ole_s:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: comiss %xmm0, %xmm1
+; SSE2-NEXT: cmovbl %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_ole_s:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm1, %ecx
+; AVX-NEXT: vpextrw $0, %xmm0, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: cmovbl %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_ole_s:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -465,6 +1189,43 @@ define i32 @test_f16_ole_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_one_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_one_s:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovel %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_one_s:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: cmovel %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_one_s:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -489,6 +1250,43 @@ define i32 @test_f16_one_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_ord_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ord_s:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovpl %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_ord_s:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: cmovpl %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_ord_s:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -513,6 +1311,43 @@ define i32 @test_f16_ord_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_ueq_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ueq_s:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovnel %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_ueq_s:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: cmovnel %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_ueq_s:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -537,6 +1372,45 @@ define i32 @test_f16_ueq_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_ugt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ugt_s:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: comiss %xmm0, %xmm1
+; SSE2-NEXT: cmovael %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_ugt_s:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm1, %ecx
+; AVX-NEXT: vpextrw $0, %xmm0, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: cmovael %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_ugt_s:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -561,6 +1435,45 @@ define i32 @test_f16_ugt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_uge_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_uge_s:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
+; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: comiss %xmm0, %xmm1
+; SSE2-NEXT: cmoval %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_uge_s:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm1, %ecx
+; AVX-NEXT: vpextrw $0, %xmm0, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: cmoval %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_uge_s:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -585,6 +1498,43 @@ define i32 @test_f16_uge_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_ult_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ult_s:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovael %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_ult_s:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: cmovael %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_ult_s:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -609,6 +1559,43 @@ define i32 @test_f16_ult_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_ule_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_ule_s:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmoval %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_ule_s:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: cmoval %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_ule_s:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -633,6 +1620,45 @@ define i32 @test_f16_ule_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_une_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_une_s:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovnel %ebp, %ebx
+; SSE2-NEXT: cmovpl %ebp, %ebx
+; SSE2-NEXT: movl %ebx, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_une_s:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: cmovnel %edi, %eax
+; AVX-NEXT: cmovpl %edi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_une_s:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -659,6 +1685,43 @@ define i32 @test_f16_une_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define i32 @test_f16_uno_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
+; SSE2-LABEL: test_f16_uno_s:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movl %esi, %ebx
+; SSE2-NEXT: movl %edi, %ebp
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: cmovnpl %ebx, %ebp
+; SSE2-NEXT: movl %ebp, %eax
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_f16_uno_s:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX-NEXT: vpextrw $0, %xmm1, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: cmovnpl %esi, %eax
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: test_f16_uno_s:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -683,6 +1746,42 @@ define i32 @test_f16_uno_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
}
define void @foo(half %0, half %1) #0 {
+; SSE2-LABEL: foo:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: jbe .LBB28_1
+; SSE2-NEXT: # %bb.2:
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: jmp bar@PLT # TAILCALL
+; SSE2-NEXT: .LBB28_1:
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: foo:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: vpextrw $0, %xmm1, %ecx
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: jbe .LBB28_1
+; AVX-NEXT: # %bb.2:
+; AVX-NEXT: jmp bar@PLT # TAILCALL
+; AVX-NEXT: .LBB28_1:
+; AVX-NEXT: retq
+;
; CHECK-32-LABEL: foo:
; CHECK-32: # %bb.0:
; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -713,7 +1812,7 @@ define void @foo(half %0, half %1) #0 {
}
declare void @bar()
-attributes #0 = { strictfp }
+attributes #0 = { strictfp nounwind }
declare i1 @llvm.experimental.constrained.fcmp.f16(half, half, metadata, metadata)
declare i1 @llvm.experimental.constrained.fcmps.f16(half, half, metadata, metadata)
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
index a1b6d09d85165..c09af463c9cb5 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
@@ -1,4 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c -O3 | FileCheck %s --check-prefixes=AVX,F16C
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64
@@ -14,6 +17,39 @@ declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata)
declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata)
define half @fadd_f16(half %a, half %b) nounwind strictfp {
+; SSE2-LABEL: fadd_f16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: addss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fadd_f16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: vpextrw $0, %xmm1, %ecx
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: fadd_f16:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -31,6 +67,39 @@ define half @fadd_f16(half %a, half %b) nounwind strictfp {
}
define half @fsub_f16(half %a, half %b) nounwind strictfp {
+; SSE2-LABEL: fsub_f16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: subss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fsub_f16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: vpextrw $0, %xmm1, %ecx
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: fsub_f16:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -48,6 +117,39 @@ define half @fsub_f16(half %a, half %b) nounwind strictfp {
}
define half @fmul_f16(half %a, half %b) nounwind strictfp {
+; SSE2-LABEL: fmul_f16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: mulss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fmul_f16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: vpextrw $0, %xmm1, %ecx
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: fmul_f16:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -65,6 +167,39 @@ define half @fmul_f16(half %a, half %b) nounwind strictfp {
}
define half @fdiv_f16(half %a, half %b) nounwind strictfp {
+; SSE2-LABEL: fdiv_f16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: divss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fdiv_f16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: vpextrw $0, %xmm1, %ecx
+; AVX-NEXT: movzwl %cx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: fdiv_f16:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -82,6 +217,24 @@ define half @fdiv_f16(half %a, half %b) nounwind strictfp {
}
define void @fpext_f16_to_f32(ptr %val, ptr %ret) nounwind strictfp {
+; SSE2-LABEL: fpext_f16_to_f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: movq %rsi, %rbx
+; SSE2-NEXT: pinsrw $0, (%rdi), %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movd %xmm0, (%rbx)
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fpext_f16_to_f32:
+; AVX: # %bb.0:
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vmovss %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
; X86-LABEL: fpext_f16_to_f32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -105,6 +258,26 @@ define void @fpext_f16_to_f32(ptr %val, ptr %ret) nounwind strictfp {
}
define void @fpext_f16_to_f64(ptr %val, ptr %ret) nounwind strictfp {
+; SSE2-LABEL: fpext_f16_to_f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: movq %rsi, %rbx
+; SSE2-NEXT: pinsrw $0, (%rdi), %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: cvtss2sd %xmm0, %xmm0
+; SSE2-NEXT: movsd %xmm0, (%rbx)
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fpext_f16_to_f64:
+; AVX: # %bb.0:
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovsd %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
; X86-LABEL: fpext_f16_to_f64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -128,6 +301,25 @@ define void @fpext_f16_to_f64(ptr %val, ptr %ret) nounwind strictfp {
}
define void @fptrunc_float_to_f16(ptr %val, ptr%ret) nounwind strictfp {
+; SSE2-LABEL: fptrunc_float_to_f16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: movq %rsi, %rbx
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: movw %ax, (%rbx)
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fptrunc_float_to_f16:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: movw %ax, (%rsi)
+; AVX-NEXT: retq
+;
; X86-LABEL: fptrunc_float_to_f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -152,6 +344,28 @@ define void @fptrunc_float_to_f16(ptr %val, ptr%ret) nounwind strictfp {
}
define void @fptrunc_double_to_f16(ptr %val, ptr%ret) nounwind strictfp {
+; SSE2-LABEL: fptrunc_double_to_f16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: movq %rsi, %rbx
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: callq __truncdfhf2@PLT
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: movw %ax, (%rbx)
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fptrunc_double_to_f16:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: movw %ax, (%rsi)
+; AVX-NEXT: retq
+;
; X86-LABEL: fptrunc_double_to_f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -176,6 +390,32 @@ define void @fptrunc_double_to_f16(ptr %val, ptr%ret) nounwind strictfp {
}
define void @fsqrt_f16(ptr %a) nounwind strictfp {
+; SSE2-LABEL: fsqrt_f16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: movq %rdi, %rbx
+; SSE2-NEXT: pinsrw $0, (%rdi), %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: sqrtss %xmm0, %xmm0
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: movw %ax, (%rbx)
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fsqrt_f16:
+; AVX: # %bb.0:
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: movw %ax, (%rdi)
+; AVX-NEXT: retq
+;
; X86-LABEL: fsqrt_f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -199,6 +439,76 @@ define void @fsqrt_f16(ptr %a) nounwind strictfp {
}
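Promoting the sqrt itself is numerically safe: a correctly rounded f32 sqrt of an f16 input, truncated back to f16, is the correctly rounded f16 sqrt by the same double-rounding argument as above. Sketch of the constrained form, mirroring the in-memory shape of the test (names illustrative):

  define void @demo_fsqrt(ptr %a) #0 {
    %v = load half, ptr %a
    %r = call half @llvm.experimental.constrained.sqrt.f16(
                     half %v, metadata !"round.dynamic",
                     metadata !"fpexcept.strict") #0
    store half %r, ptr %a
    ret void
  }
  declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata)
  attributes #0 = { strictfp nounwind }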
define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
+; SSE2-LABEL: fma_f16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: subq $24, %rsp
+; SSE2-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
+; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: callq fmaf@PLT
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: addq $24, %rsp
+; SSE2-NEXT: retq
+;
+; F16C-LABEL: fma_f16:
+; F16C: # %bb.0:
+; F16C-NEXT: pushq %rax
+; F16C-NEXT: vpextrw $0, %xmm0, %eax
+; F16C-NEXT: vpextrw $0, %xmm1, %ecx
+; F16C-NEXT: vpextrw $0, %xmm2, %edx
+; F16C-NEXT: movzwl %dx, %edx
+; F16C-NEXT: vmovd %edx, %xmm0
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm2
+; F16C-NEXT: movzwl %cx, %ecx
+; F16C-NEXT: vmovd %ecx, %xmm0
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm1
+; F16C-NEXT: movzwl %ax, %eax
+; F16C-NEXT: vmovd %eax, %xmm0
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: callq fmaf@PLT
+; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; F16C-NEXT: popq %rax
+; F16C-NEXT: retq
+;
+; AVX512-LABEL: fma_f16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpextrw $0, %xmm1, %eax
+; AVX512-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX512-NEXT: vpextrw $0, %xmm2, %edx
+; AVX512-NEXT: movzwl %dx, %edx
+; AVX512-NEXT: vmovd %edx, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movzwl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm0
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT: retq
+;
; X86-LABEL: fma_f16:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
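FMA is the most involved promotion: the SSE2 configuration extends all three operands through __extendhfsf2, spilling the not-yet-extended values around each libcall, and the +f16c RUN line, which does not include the FMA feature, likewise falls back to fmaf, while +avx512f can emit vfmadd213ss. Whether the single widening fma rounds identically to a true f16 fma rests on the same double-rounding argument; f32's 24 significand bits sit exactly at the 2p+2 bound for f16. Illustrative constrained call:

  define half @demo_fma(half %a, half %b, half %c) #0 {
    %r = call half @llvm.experimental.constrained.fma.f16(
                     half %a, half %b, half %c,
                     metadata !"round.dynamic",
                     metadata !"fpexcept.strict") #0
    ret half %r
  }
  declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata)
  attributes #0 = { strictfp nounwind }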
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll
index 9ea19ca318816..fac14d8f14e8a 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll
@@ -1,4 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c -O3 | FileCheck %s --check-prefixes=AVX,F16C
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64
@@ -14,6 +17,25 @@ declare i32 @llvm.experimental.constrained.fptoui.i32.f16(half, metadata)
declare i64 @llvm.experimental.constrained.fptoui.i64.f16(half, metadata)
define i1 @fptosi_f16toi1(half %x) #0 {
+; SSE2-LABEL: fptosi_f16toi1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: cvttss2si %xmm0, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: popq %rcx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fptosi_f16toi1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vcvttss2si %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
+;
; X86-LABEL: fptosi_f16toi1:
; X86: # %bb.0:
; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax
@@ -31,6 +53,25 @@ define i1 @fptosi_f16toi1(half %x) #0 {
}
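The AVX prologue here (vpextrw $0 / movzwl / vmovd / vcvtph2ps) is the standard dance for pulling an ABI half out of the low word of %xmm0 and widening it to f32 without a round trip through memory; the `# kill` lines merely record that only the low sub-register of the result is live. Since there is no converter to i1/i8/i16, the value goes through cvttss2si into a 32-bit register and the result is truncated implicitly. A sketch with an illustrative name:

  define i8 @demo_fptosi_i8(half %x) #0 {
    ; fp-to-int always truncates, so only exception behavior is constrained.
    %r = call i8 @llvm.experimental.constrained.fptosi.i8.f16(
                   half %x, metadata !"fpexcept.strict") #0
    ret i8 %r
  }
  declare i8 @llvm.experimental.constrained.fptosi.i8.f16(half, metadata)
  attributes #0 = { strictfp nounwind }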
define i8 @fptosi_f16toi8(half %x) #0 {
+; SSE2-LABEL: fptosi_f16toi8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: cvttss2si %xmm0, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: popq %rcx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fptosi_f16toi8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vcvttss2si %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
+;
; X86-LABEL: fptosi_f16toi8:
; X86: # %bb.0:
; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax
@@ -48,6 +89,25 @@ define i8 @fptosi_f16toi8(half %x) #0 {
}
define i16 @fptosi_f16toi16(half %x) #0 {
+; SSE2-LABEL: fptosi_f16toi16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: cvttss2si %xmm0, %eax
+; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT: popq %rcx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fptosi_f16toi16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vcvttss2si %xmm0, %eax
+; AVX-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX-NEXT: retq
+;
; X86-LABEL: fptosi_f16toi16:
; X86: # %bb.0:
; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax
@@ -65,6 +125,23 @@ define i16 @fptosi_f16toi16(half %x) #0 {
}
define i32 @fptosi_f16toi32(half %x) #0 {
+; SSE2-LABEL: fptosi_f16toi32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: cvttss2si %xmm0, %eax
+; SSE2-NEXT: popq %rcx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fptosi_f16toi32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vcvttss2si %xmm0, %eax
+; AVX-NEXT: retq
+;
; X86-LABEL: fptosi_f16toi32:
; X86: # %bb.0:
; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax
@@ -80,6 +157,23 @@ define i32 @fptosi_f16toi32(half %x) #0 {
}
define i64 @fptosi_f16toi64(half %x) #0 {
+; SSE2-LABEL: fptosi_f16toi64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: cvttss2si %xmm0, %rax
+; SSE2-NEXT: popq %rcx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fptosi_f16toi64:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vcvttss2si %xmm0, %rax
+; AVX-NEXT: retq
+;
; X86-LABEL: fptosi_f16toi64:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -98,6 +192,25 @@ define i64 @fptosi_f16toi64(half %x) #0 {
}
define i1 @fptoui_f16toi1(half %x) #0 {
+; SSE2-LABEL: fptoui_f16toi1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: cvttss2si %xmm0, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: popq %rcx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fptoui_f16toi1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vcvttss2si %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
+;
; X86-LABEL: fptoui_f16toi1:
; X86: # %bb.0:
; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax
@@ -115,6 +228,25 @@ define i1 @fptoui_f16toi1(half %x) #0 {
}
define i8 @fptoui_f16toi8(half %x) #0 {
+; SSE2-LABEL: fptoui_f16toi8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: cvttss2si %xmm0, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: popq %rcx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fptoui_f16toi8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vcvttss2si %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
+;
; X86-LABEL: fptoui_f16toi8:
; X86: # %bb.0:
; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax
@@ -132,6 +264,25 @@ define i8 @fptoui_f16toi8(half %x) #0 {
}
define i16 @fptoui_f16toi16(half %x) #0 {
+; SSE2-LABEL: fptoui_f16toi16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: cvttss2si %xmm0, %eax
+; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT: popq %rcx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fptoui_f16toi16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vcvttss2si %xmm0, %eax
+; AVX-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX-NEXT: retq
+;
; X86-LABEL: fptoui_f16toi16:
; X86: # %bb.0:
; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax
@@ -149,6 +300,34 @@ define i16 @fptoui_f16toi16(half %x) #0 {
}
define i32 @fptoui_f16toi32(half %x) #0 {
+; SSE2-LABEL: fptoui_f16toi32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: cvttss2si %xmm0, %rax
+; SSE2-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE2-NEXT: popq %rcx
+; SSE2-NEXT: retq
+;
+; F16C-LABEL: fptoui_f16toi32:
+; F16C: # %bb.0:
+; F16C-NEXT: vpextrw $0, %xmm0, %eax
+; F16C-NEXT: movzwl %ax, %eax
+; F16C-NEXT: vmovd %eax, %xmm0
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: vcvttss2si %xmm0, %rax
+; F16C-NEXT: # kill: def $eax killed $eax killed $rax
+; F16C-NEXT: retq
+;
+; AVX512-LABEL: fptoui_f16toi32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpextrw $0, %xmm0, %eax
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vcvttss2usi %xmm0, %eax
+; AVX512-NEXT: retq
+;
; X86-LABEL: fptoui_f16toi32:
; X86: # %bb.0:
; X86-NEXT: vcvttsh2usi {{[0-9]+}}(%esp), %eax
@@ -164,6 +343,56 @@ define i32 @fptoui_f16toi32(half %x) #0 {
}
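This is the first function where the two AVX configurations diverge, hence the separate F16C and AVX512 check prefixes: without AVX512 an unsigned 32-bit result is obtained by converting through the signed 64-bit cvttss2si and keeping the low half (safe because every u32 value fits in an s64), whereas AVX512F supplies vcvttss2usi directly. Illustrative constrained call:

  define i32 @demo_fptoui_i32(half %x) #0 {
    %r = call i32 @llvm.experimental.constrained.fptoui.i32.f16(
                    half %x, metadata !"fpexcept.strict") #0
    ret i32 %r
  }
  declare i32 @llvm.experimental.constrained.fptoui.i32.f16(half, metadata)
  attributes #0 = { strictfp nounwind }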
define i64 @fptoui_f16toi64(half %x) #0 {
+; SSE2-LABEL: fptoui_f16toi64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: comiss %xmm2, %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: jb .LBB9_2
+; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: .LBB9_2:
+; SSE2-NEXT: subss %xmm1, %xmm0
+; SSE2-NEXT: cvttss2si %xmm0, %rcx
+; SSE2-NEXT: setae %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: shlq $63, %rax
+; SSE2-NEXT: xorq %rcx, %rax
+; SSE2-NEXT: popq %rcx
+; SSE2-NEXT: retq
+;
+; F16C-LABEL: fptoui_f16toi64:
+; F16C: # %bb.0:
+; F16C-NEXT: vpextrw $0, %xmm0, %eax
+; F16C-NEXT: movzwl %ax, %eax
+; F16C-NEXT: vmovd %eax, %xmm0
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; F16C-NEXT: vcomiss %xmm1, %xmm0
+; F16C-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; F16C-NEXT: jb .LBB9_2
+; F16C-NEXT: # %bb.1:
+; F16C-NEXT: vmovaps %xmm1, %xmm2
+; F16C-NEXT: .LBB9_2:
+; F16C-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; F16C-NEXT: vcvttss2si %xmm0, %rcx
+; F16C-NEXT: setae %al
+; F16C-NEXT: movzbl %al, %eax
+; F16C-NEXT: shlq $63, %rax
+; F16C-NEXT: xorq %rcx, %rax
+; F16C-NEXT: retq
+;
+; AVX512-LABEL: fptoui_f16toi64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpextrw $0, %xmm0, %eax
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vcvttss2usi %xmm0, %rax
+; AVX512-NEXT: retq
+;
; X86-LABEL: fptoui_f16toi64:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
@@ -181,4 +410,4 @@ define i64 @fptoui_f16toi64(half %x) #0 {
ret i64 %result
}
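For u64 there is no wider signed type to borrow, so the SSE2 and F16C expansions use the classic bias trick: compare against 2^63 (the constant feeding comiss), subtract it when the input is at least that large, convert with the signed cvttss2si, then fold the bias back into the sign bit via setae/shlq $63/xorq. AVX512 again collapses everything into vcvttss2usi. Sketch (illustrative name):

  define i64 @demo_fptoui_i64(half %x) #0 {
    ; Pre-AVX512 expansion, in effect:
    ;   if (x < 0x1p63)  return (s64)x;
    ;   else             return (s64)(x - 0x1p63) ^ (1ull << 63);
    %r = call i64 @llvm.experimental.constrained.fptoui.i64.f16(
                    half %x, metadata !"fpexcept.strict") #0
    ret i64 %r
  }
  declare i64 @llvm.experimental.constrained.fptoui.i64.f16(half, metadata)
  attributes #0 = { strictfp nounwind }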
-attributes #0 = { strictfp }
+attributes #0 = { strictfp nounwind }
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
index 58b6068ea53ac..7617aee5e5fbf 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
@@ -1,4 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c -O3 | FileCheck %s --check-prefixes=AVX,F16C
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64
@@ -14,6 +17,30 @@ declare half @llvm.experimental.constrained.uitofp.f16.i32(i32, metadata, metada
declare half @llvm.experimental.constrained.uitofp.f16.i64(i64, metadata, metadata)
define half @sitofp_i1tof16(i1 %x) #0 {
+; SSE2-LABEL: sitofp_i1tof16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: andb $1, %dil
+; SSE2-NEXT: negb %dil
+; SSE2-NEXT: movsbl %dil, %eax
+; SSE2-NEXT: cvtsi2ss %eax, %xmm0
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: sitofp_i1tof16:
+; AVX: # %bb.0:
+; AVX-NEXT: andb $1, %dil
+; AVX-NEXT: negb %dil
+; AVX-NEXT: movsbl %dil, %eax
+; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: sitofp_i1tof16:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
@@ -37,6 +64,26 @@ define half @sitofp_i1tof16(i1 %x) #0 {
}
define half @sitofp_i8tof16(i8 %x) #0 {
+; SSE2-LABEL: sitofp_i8tof16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movsbl %dil, %eax
+; SSE2-NEXT: cvtsi2ss %eax, %xmm0
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: sitofp_i8tof16:
+; AVX: # %bb.0:
+; AVX-NEXT: movsbl %dil, %eax
+; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: sitofp_i8tof16:
; X86: # %bb.0:
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
@@ -55,6 +102,26 @@ define half @sitofp_i8tof16(i8 %x) #0 {
}
define half @sitofp_i16tof16(i16 %x) #0 {
+; SSE2-LABEL: sitofp_i16tof16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movswl %di, %eax
+; SSE2-NEXT: cvtsi2ss %eax, %xmm0
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: sitofp_i16tof16:
+; AVX: # %bb.0:
+; AVX-NEXT: movswl %di, %eax
+; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: sitofp_i16tof16:
; X86: # %bb.0:
; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
@@ -73,6 +140,24 @@ define half @sitofp_i16tof16(i16 %x) #0 {
}
define half @sitofp_i32tof16(i32 %x) #0 {
+; SSE2-LABEL: sitofp_i32tof16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: cvtsi2ss %edi, %xmm0
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: sitofp_i32tof16:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: sitofp_i32tof16:
; X86: # %bb.0:
; X86-NEXT: vcvtsi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -89,6 +174,24 @@ define half @sitofp_i32tof16(i32 %x) #0 {
}
define half @sitofp_i64tof16(i64 %x) #0 {
+; SSE2-LABEL: sitofp_i64tof16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: cvtsi2ss %rdi, %xmm0
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: sitofp_i64tof16:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: sitofp_i64tof16:
; X86: # %bb.0:
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
@@ -106,6 +209,26 @@ define half @sitofp_i64tof16(i64 %x) #0 {
}
define half @uitofp_i1tof16(i1 %x) #0 {
+; SSE2-LABEL: uitofp_i1tof16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: andl $1, %edi
+; SSE2-NEXT: cvtsi2ss %edi, %xmm0
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: uitofp_i1tof16:
+; AVX: # %bb.0:
+; AVX-NEXT: andl $1, %edi
+; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: uitofp_i1tof16:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
@@ -126,6 +249,26 @@ define half @uitofp_i1tof16(i1 %x) #0 {
}
define half @uitofp_i8tof16(i8 %x) #0 {
+; SSE2-LABEL: uitofp_i8tof16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movzbl %dil, %eax
+; SSE2-NEXT: cvtsi2ss %eax, %xmm0
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: uitofp_i8tof16:
+; AVX: # %bb.0:
+; AVX-NEXT: movzbl %dil, %eax
+; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: uitofp_i8tof16:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
@@ -144,6 +287,26 @@ define half @uitofp_i8tof16(i8 %x) #0 {
}
define half @uitofp_i16tof16(i16 %x) #0 {
+; SSE2-LABEL: uitofp_i16tof16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movzwl %di, %eax
+; SSE2-NEXT: cvtsi2ss %eax, %xmm0
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: uitofp_i16tof16:
+; AVX: # %bb.0:
+; AVX-NEXT: movzwl %di, %eax
+; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: uitofp_i16tof16:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
@@ -162,6 +325,36 @@ define half @uitofp_i16tof16(i16 %x) #0 {
}
define half @uitofp_i32tof16(i32 %x) #0 {
+; SSE2-LABEL: uitofp_i32tof16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movl %edi, %eax
+; SSE2-NEXT: cvtsi2ss %rax, %xmm0
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; F16C-LABEL: uitofp_i32tof16:
+; F16C: # %bb.0:
+; F16C-NEXT: movl %edi, %eax
+; F16C-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
+; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; F16C-NEXT: retq
+;
+; AVX512-LABEL: uitofp_i32tof16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT: retq
+;
; X86-LABEL: uitofp_i32tof16:
; X86: # %bb.0:
; X86-NEXT: vcvtusi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -178,6 +371,56 @@ define half @uitofp_i32tof16(i32 %x) #0 {
}
define half @uitofp_i64tof16(i64 %x) #0 {
+; SSE2-LABEL: uitofp_i64tof16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: shrq %rax
+; SSE2-NEXT: movl %edi, %ecx
+; SSE2-NEXT: andl $1, %ecx
+; SSE2-NEXT: orq %rax, %rcx
+; SSE2-NEXT: testq %rdi, %rdi
+; SSE2-NEXT: cmovnsq %rdi, %rcx
+; SSE2-NEXT: cvtsi2ss %rcx, %xmm0
+; SSE2-NEXT: jns .LBB9_2
+; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: addss %xmm0, %xmm0
+; SSE2-NEXT: .LBB9_2:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; F16C-LABEL: uitofp_i64tof16:
+; F16C: # %bb.0:
+; F16C-NEXT: movq %rdi, %rax
+; F16C-NEXT: shrq %rax
+; F16C-NEXT: movl %edi, %ecx
+; F16C-NEXT: andl $1, %ecx
+; F16C-NEXT: orq %rax, %rcx
+; F16C-NEXT: testq %rdi, %rdi
+; F16C-NEXT: cmovnsq %rdi, %rcx
+; F16C-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0
+; F16C-NEXT: jns .LBB9_2
+; F16C-NEXT: # %bb.1:
+; F16C-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; F16C-NEXT: .LBB9_2:
+; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; F16C-NEXT: retq
+;
+; AVX512-LABEL: uitofp_i64tof16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512-NEXT: retq
+;
; X86-LABEL: uitofp_i64tof16:
; X86: # %bb.0:
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
@@ -194,4 +437,4 @@ define half @uitofp_i64tof16(i64 %x) #0 {
ret half %result
}
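The u64 source direction inherits the standard u64->f32 expansion: when the sign bit is set, the value is halved with the dropped bit ORed back in as a sticky bit (shrq/andl/orq), converted as signed, and doubled with addss so the final rounding still comes out correct; non-negative inputs take the cmovnsq path and convert directly. AVX512 replaces the whole sequence with vcvtusi2ss. Illustrative constrained call:

  define half @demo_uitofp_i64(i64 %x) #0 {
    ; int-to-fp can round, so rounding and exception metadata both appear.
    %r = call half @llvm.experimental.constrained.uitofp.f16.i64(
                     i64 %x, metadata !"round.dynamic",
                     metadata !"fpexcept.strict") #0
    ret half %r
  }
  declare half @llvm.experimental.constrained.uitofp.f16.i64(i64, metadata, metadata)
  attributes #0 = { strictfp nounwind }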
-attributes #0 = { strictfp }
+attributes #0 = { strictfp nounwind }
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
index 5d5ebcb278628..3b9798a2af582 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
@@ -1,4 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c -O3 | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -O3 | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -O3 | FileCheck %s --check-prefixes=X64
@@ -11,6 +14,29 @@ declare half @llvm.experimental.constrained.roundeven.f16(half, metadata)
declare half @llvm.experimental.constrained.round.f16(half, metadata)
define half @fceil32(half %f) #0 {
+; SSE2-LABEL: fceil32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: callq ceilf@PLT
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fceil32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: fceil32:
; X86: # %bb.0:
; X86-NEXT: vrndscalesh $10, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -26,6 +52,29 @@ define half @fceil32(half %f) #0 {
}
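The vroundss immediates used throughout this file decode per the SSE4.1 ROUNDSS encoding: bit 3 (value 8) suppresses the precision (inexact) exception, bit 2 (value 4) selects the MXCSR rounding mode, and bits 1:0 give the static mode. Thus $10 is ceil, $9 floor, $11 trunc, $8 round-to-nearest-even, $12 nearbyint (dynamic mode, inexact suppressed), and $4 rint (dynamic mode, inexact reported), matching the functions below. The constrained form for fceil32, with an illustrative name:

  define half @demo_fceil(half %f) #0 {
    %r = call half @llvm.experimental.constrained.ceil.f16(
                     half %f, metadata !"fpexcept.strict") #0
    ret half %r
  }
  declare half @llvm.experimental.constrained.ceil.f16(half, metadata)
  attributes #0 = { strictfp nounwind }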
define half @ffloor32(half %f) #0 {
+; SSE2-LABEL: ffloor32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: callq floorf@PLT
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: ffloor32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: ffloor32:
; X86: # %bb.0:
; X86-NEXT: vrndscalesh $9, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -41,6 +90,29 @@ define half @ffloor32(half %f) #0 {
}
define half @ftrunc32(half %f) #0 {
+; SSE2-LABEL: ftrunc32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: callq truncf@PLT
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: ftrunc32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: ftrunc32:
; X86: # %bb.0:
; X86-NEXT: vrndscalesh $11, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -56,6 +128,29 @@ define half @ftrunc32(half %f) #0 {
}
define half @frint32(half %f) #0 {
+; SSE2-LABEL: frint32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: callq rintf@PLT
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: frint32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: frint32:
; X86: # %bb.0:
; X86-NEXT: vrndscalesh $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -72,6 +167,29 @@ define half @frint32(half %f) #0 {
}
define half @fnearbyint32(half %f) #0 {
+; SSE2-LABEL: fnearbyint32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: callq nearbyintf@PLT
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fnearbyint32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: fnearbyint32:
; X86: # %bb.0:
; X86-NEXT: vrndscalesh $12, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -88,6 +206,29 @@ define half @fnearbyint32(half %f) #0 {
}
define half @froundeven16(half %f) #0 {
+; SSE2-LABEL: froundeven16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: callq roundevenf@PLT
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: froundeven16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
; X86-LABEL: froundeven16:
; X86: # %bb.0:
; X86-NEXT: vrndscalesh $8, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -104,10 +245,34 @@ define half @froundeven16(half %f) #0 {
}
define half @fround16(half %f) #0 {
+; SSE2-LABEL: fround16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: callq roundf@PLT
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: popq %rax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: fround16:
+; AVX: # %bb.0:
+; AVX-NEXT: pushq %rax
+; AVX-NEXT: vpextrw $0, %xmm0, %eax
+; AVX-NEXT: movzwl %ax, %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: callq roundf@PLT
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: popq %rax
+; AVX-NEXT: retq
+;
; X86-LABEL: fround16:
; X86: # %bb.0:
; X86-NEXT: subl $8, %esp
-; X86-NEXT: .cfi_def_cfa_offset 12
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
@@ -117,18 +282,15 @@ define half @fround16(half %f) #0 {
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: fround16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
-; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
; X64-NEXT: callq roundf@PLT
; X64-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
; X64-NEXT: popq %rax
-; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%res = call half @llvm.experimental.constrained.round.f16(
@@ -136,4 +298,4 @@ define half @fround16(half %f) #0 {
ret half %res
}
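round() (halfway cases away from zero) is the one flavor ROUNDSS cannot encode, which is why even the AVX configuration extends to f32 and calls roundf, keeping a reserved stack slot live across the call. Illustrative constrained form:

  define half @demo_fround(half %f) #0 {
    %r = call half @llvm.experimental.constrained.round.f16(
                     half %f, metadata !"fpexcept.strict") #0
    ret half %r
  }
  declare half @llvm.experimental.constrained.round.f16(half, metadata)
  attributes #0 = { strictfp nounwind }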
-attributes #0 = { strictfp }
+attributes #0 = { strictfp nounwind }
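A closing note on this attribute change, which all of the test files in the patch receive: adding nounwind lets llc drop the unwind-table CFI directives, which is why the .cfi_def_cfa_offset lines vanish from the X86/X64 checks above and why the autogenerated assertions stay stable across prefixes that do and do not touch the stack.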