[llvm] [X86] Combine `uitofp <v x i32> to <v x half>` (PR #121809)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 7 05:59:55 PST 2025
https://github.com/abhishek-kaushik22 updated https://github.com/llvm/llvm-project/pull/121809
>From 9f62f4105035091b57c6912a22f3e0d7f72bdf2b Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Tue, 7 Jan 2025 03:19:13 +0530
Subject: [PATCH 01/10] Update LegalizeVectorOps.cpp
---
.../SelectionDAG/LegalizeVectorOps.cpp | 21 +++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 154c8aea6bcd17..2be9239202b021 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1776,6 +1776,27 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node,
assert((BW == 64 || BW == 32) &&
"Elements in vector-UINT_TO_FP must be 32 or 64 bits wide");
+ // If STRICT_/FMUL is not supported by the target (in case of f16) replace the
+ // UINT_TO_FP with a larger float and round to the smaller type
+ if ((!IsStrict && TLI.getOperationAction(ISD::FMUL, Node->getValueType(0)) ==
+ TargetLowering::Expand) ||
+ (IsStrict &&
+ TLI.getOperationAction(ISD::STRICT_FMUL, Node->getValueType(0)) ==
+ TargetLowering::Expand)) {
+ EVT FPVT = BW == 32 ? MVT::f32 : MVT::f64;
+ SDLoc DL(Node);
+ unsigned Round = IsStrict ? ISD::STRICT_FP_ROUND : ISD::FP_ROUND;
+ unsigned UIToFP = IsStrict ? ISD::STRICT_UINT_TO_FP : ISD::UINT_TO_FP;
+ SDValue Result = DAG.getNode(
+ Round, DL, Node->getValueType(0),
+ DAG.getNode(UIToFP, DL, VT.changeVectorElementType(FPVT), Src),
+ DAG.getTargetConstant(
+ 0, DL,
+ DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout())));
+ Results.push_back(Result);
+ return;
+ }
+
SDValue HalfWord = DAG.getConstant(BW / 2, DL, VT);
// Constants to clear the upper part of the word.
>From 74f53279c202d08e0442ae3681904d583209264a Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Tue, 7 Jan 2025 03:38:22 +0530
Subject: [PATCH 02/10] Update LegalizeVectorOps.cpp
---
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 966c9fff5750d3..17605eb5034685 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1779,18 +1779,17 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node,
// If STRICT_/FMUL is not supported by the target (in case of f16) replace the
// UINT_TO_FP with a larger float and round to the smaller type
- if ((!IsStrict && TLI.getOperationAction(ISD::FMUL, Node->getValueType(0)) ==
- TargetLowering::Expand) ||
- (IsStrict &&
- TLI.getOperationAction(ISD::STRICT_FMUL, Node->getValueType(0)) ==
- TargetLowering::Expand)) {
+ if ((!IsStrict &&
+ TLI.getOperationAction(ISD::FMUL, DstVT) == TargetLowering::Expand) ||
+ (IsStrict && TLI.getOperationAction(ISD::STRICT_FMUL, DstVT) ==
+ TargetLowering::Expand)) {
EVT FPVT = BW == 32 ? MVT::f32 : MVT::f64;
SDLoc DL(Node);
unsigned Round = IsStrict ? ISD::STRICT_FP_ROUND : ISD::FP_ROUND;
unsigned UIToFP = IsStrict ? ISD::STRICT_UINT_TO_FP : ISD::UINT_TO_FP;
SDValue Result = DAG.getNode(
- Round, DL, Node->getValueType(0),
- DAG.getNode(UIToFP, DL, VT.changeVectorElementType(FPVT), Src),
+ Round, DL, DstVT,
+ DAG.getNode(UIToFP, DL, SrcVT.changeVectorElementType(FPVT), Src),
DAG.getTargetConstant(
0, DL,
DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout())));
>From 40ca7cf0f200e977906d198c176d2f16e76cb8e3 Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Tue, 7 Jan 2025 13:38:23 +0530
Subject: [PATCH 03/10] Update LegalizeVectorOps.cpp
---
.../SelectionDAG/LegalizeVectorOps.cpp | 28 ++++++++++++-------
1 file changed, 18 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 17605eb5034685..33621201fe1ef3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1784,16 +1784,24 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node,
(IsStrict && TLI.getOperationAction(ISD::STRICT_FMUL, DstVT) ==
TargetLowering::Expand)) {
EVT FPVT = BW == 32 ? MVT::f32 : MVT::f64;
- SDLoc DL(Node);
- unsigned Round = IsStrict ? ISD::STRICT_FP_ROUND : ISD::FP_ROUND;
- unsigned UIToFP = IsStrict ? ISD::STRICT_UINT_TO_FP : ISD::UINT_TO_FP;
- SDValue Result = DAG.getNode(
- Round, DL, DstVT,
- DAG.getNode(UIToFP, DL, SrcVT.changeVectorElementType(FPVT), Src),
- DAG.getTargetConstant(
- 0, DL,
- DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout())));
- Results.push_back(Result);
+ SDValue UIToFP;
+ SDValue Result;
+ SDValue TargetZero = DAG.getIntPtrConstant(0, DL, /*isTarget=*/true);
+ if (IsStrict) {
+ UIToFP = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL,
+ {SrcVT.changeVectorElementType(FPVT), MVT::Other},
+ {Node->getOperand(0), Src});
+ Result = DAG.getNode(ISD::STRICT_FP_ROUND, DL, {DstVT, MVT::Other},
+ {Node->getOperand(0), UIToFP, TargetZero});
+ Results.push_back(Result);
+ Results.push_back(Result.getValue(1));
+ } else {
+ UIToFP = DAG.getNode(ISD::UINT_TO_FP, DL,
+ SrcVT.changeVectorElementType(FPVT), Src);
+ Result = DAG.getNode(ISD::FP_ROUND, DL, DstVT, UIToFP, TargetZero);
+ Results.push_back(Result);
+ }
+
return;
}
>From 9a519bc9ac6a09c37485f30d5009e5b98bfb0636 Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Tue, 7 Jan 2025 13:58:57 +0530
Subject: [PATCH 04/10] Update LegalizeVectorOps.cpp
---
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 33621201fe1ef3..1cbee72cfc7ce4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1787,17 +1787,16 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node,
SDValue UIToFP;
SDValue Result;
SDValue TargetZero = DAG.getIntPtrConstant(0, DL, /*isTarget=*/true);
+ EVT FloatVecVT = SrcVT.changeVectorElementType(FPVT);
if (IsStrict) {
- UIToFP = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL,
- {SrcVT.changeVectorElementType(FPVT), MVT::Other},
+ UIToFP = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {FloatVecVT, MVT::Other},
{Node->getOperand(0), Src});
Result = DAG.getNode(ISD::STRICT_FP_ROUND, DL, {DstVT, MVT::Other},
{Node->getOperand(0), UIToFP, TargetZero});
Results.push_back(Result);
Results.push_back(Result.getValue(1));
} else {
- UIToFP = DAG.getNode(ISD::UINT_TO_FP, DL,
- SrcVT.changeVectorElementType(FPVT), Src);
+ UIToFP = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVecVT, Src);
Result = DAG.getNode(ISD::FP_ROUND, DL, DstVT, UIToFP, TargetZero);
Results.push_back(Result);
}
>From eb53636a01b4534c244c07bbf6a27ed890fdf5b2 Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Tue, 7 Jan 2025 14:18:16 +0530
Subject: [PATCH 05/10] Add test with auto assertions
---
.../X86/test_UINT_TO_FP_no_inf_corei7_avx.ll | 404 ++++++++++++++++++
1 file changed, 404 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
diff --git a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
new file mode 100644
index 00000000000000..07296068d51825
--- /dev/null
+++ b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
@@ -0,0 +1,404 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7-avx | FileCheck %s
+
+define <8 x half> @test_UINT_TO_FP_no_inf8(<8 x i32> %a) {
+; CHECK-LABEL: test_UINT_TO_FP_no_inf8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $88, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
+; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: addq $88, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+entry:
+ %vec = uitofp <8 x i32> %a to <8 x half>
+ ret <8 x half> %vec
+}
+
+define <8 x half> @test_STRICT_UINT_TO_FP_no_inf8(<8 x i32> %a) {
+; CHECK-LABEL: test_STRICT_UINT_TO_FP_no_inf8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $88, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
+; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: addq $88, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+entry:
+ %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f34.i34(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <8 x half> %vec
+}
+
+define <16 x half> @test_UINT_TO_FP_no_inf16(<16 x i32> %a) {
+; CHECK-LABEL: test_UINT_TO_FP_no_inf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $120, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 128
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vpsrld $16, %xmm1, %xmm0
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; CHECK-NEXT: vpsrld $16, %xmm2, %xmm0
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm1
+; CHECK-NEXT: vpsrld $16, %xmm1, %xmm1
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
+; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: addq $120, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+entry:
+ %vec = uitofp <16 x i32> %a to <16 x half>
+ ret <16 x half> %vec
+}
+
+define <16 x half> @test_STRICT_UINT_TO_FP_no_inf16(<16 x i32> %a) {
+; CHECK-LABEL: test_STRICT_UINT_TO_FP_no_inf16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $120, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 128
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vpsrld $16, %xmm1, %xmm0
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; CHECK-NEXT: vpsrld $16, %xmm2, %xmm0
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm1
+; CHECK-NEXT: vpsrld $16, %xmm1, %xmm1
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
+; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq __truncsfhf2@PLT
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: addq $120, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+entry:
+ %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f34.i34(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <16 x half> %vec
+}
>From cab2613d61515405314ad0b5544a544caf721309 Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Tue, 7 Jan 2025 14:38:32 +0530
Subject: [PATCH 06/10] Update test
---
.../X86/test_UINT_TO_FP_no_inf_corei7_avx.ll | 789 +++++++++---------
1 file changed, 408 insertions(+), 381 deletions(-)
diff --git a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
index 07296068d51825..90ec157359536b 100644
--- a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
+++ b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
@@ -1,404 +1,431 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s -check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s -check-prefix=AVX512
define <8 x half> @test_UINT_TO_FP_no_inf8(<8 x i32> %a) {
-; CHECK-LABEL: test_UINT_TO_FP_no_inf8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subq $88, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 96
-; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2
-; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
-; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: addq $88, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
+; AVX-LABEL: test_UINT_TO_FP_no_inf8:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: subq $88, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 96
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: addq $88, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_UINT_TO_FP_no_inf8:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvtudq2ps %ymm0, %ymm0
+; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
entry:
%vec = uitofp <8 x i32> %a to <8 x half>
ret <8 x half> %vec
}
define <8 x half> @test_STRICT_UINT_TO_FP_no_inf8(<8 x i32> %a) {
-; CHECK-LABEL: test_STRICT_UINT_TO_FP_no_inf8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subq $88, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 96
-; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2
-; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
-; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: addq $88, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
+; AVX-LABEL: test_STRICT_UINT_TO_FP_no_inf8:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: subq $88, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 96
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: addq $88, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_STRICT_UINT_TO_FP_no_inf8:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvtudq2ps %ymm0, %ymm0
+; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
entry:
- %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f34.i34(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f16.i32(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
ret <8 x half> %vec
}
define <16 x half> @test_UINT_TO_FP_no_inf16(<16 x i32> %a) {
-; CHECK-LABEL: test_UINT_TO_FP_no_inf16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subq $120, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 128
-; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vpsrld $16, %xmm1, %xmm0
-; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2
-; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
-; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; CHECK-NEXT: vpsrld $16, %xmm2, %xmm0
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm1
-; CHECK-NEXT: vpsrld $16, %xmm1, %xmm1
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
-; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
-; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: addq $120, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
+; AVX-LABEL: test_UINT_TO_FP_no_inf16:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: subq $120, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 128
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm0
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vpsrld $16, %xmm2, %xmm0
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
+; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT: addq $120, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_UINT_TO_FP_no_inf16:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512-NEXT: retq
entry:
%vec = uitofp <16 x i32> %a to <16 x half>
ret <16 x half> %vec
}
define <16 x half> @test_STRICT_UINT_TO_FP_no_inf16(<16 x i32> %a) {
-; CHECK-LABEL: test_STRICT_UINT_TO_FP_no_inf16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subq $120, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 128
-; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vpsrld $16, %xmm1, %xmm0
-; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2
-; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
-; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; CHECK-NEXT: vpsrld $16, %xmm2, %xmm0
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm1
-; CHECK-NEXT: vpsrld $16, %xmm1, %xmm1
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
-; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
-; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,0]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: addq $120, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
+; AVX-LABEL: test_STRICT_UINT_TO_FP_no_inf16:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: subq $120, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 128
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm0
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vpsrld $16, %xmm2, %xmm0
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
+; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[3,3,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,0]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT: addq $120, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_STRICT_UINT_TO_FP_no_inf16:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512-NEXT: retq
entry:
- %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f34.i34(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
ret <16 x half> %vec
}
>From 96707b1886ca2a675087293f09af18f498f2cf84 Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Tue, 7 Jan 2025 14:40:56 +0530
Subject: [PATCH 07/10] Fix indentation
---
.../X86/test_UINT_TO_FP_no_inf_corei7_avx.ll | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
index 90ec157359536b..43517b3871cf86 100644
--- a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
+++ b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
@@ -75,8 +75,8 @@ define <8 x half> @test_UINT_TO_FP_no_inf8(<8 x i32> %a) {
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
- %vec = uitofp <8 x i32> %a to <8 x half>
- ret <8 x half> %vec
+ %vec = uitofp <8 x i32> %a to <8 x half>
+ ret <8 x half> %vec
}
define <8 x half> @test_STRICT_UINT_TO_FP_no_inf8(<8 x i32> %a) {
@@ -152,8 +152,8 @@ define <8 x half> @test_STRICT_UINT_TO_FP_no_inf8(<8 x i32> %a) {
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
- %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f16.i32(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
- ret <8 x half> %vec
+ %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f16.i32(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <8 x half> %vec
}
define <16 x half> @test_UINT_TO_FP_no_inf16(<16 x i32> %a) {
@@ -289,8 +289,8 @@ define <16 x half> @test_UINT_TO_FP_no_inf16(<16 x i32> %a) {
; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0
; AVX512-NEXT: retq
entry:
- %vec = uitofp <16 x i32> %a to <16 x half>
- ret <16 x half> %vec
+ %vec = uitofp <16 x i32> %a to <16 x half>
+ ret <16 x half> %vec
}
define <16 x half> @test_STRICT_UINT_TO_FP_no_inf16(<16 x i32> %a) {
@@ -426,6 +426,6 @@ define <16 x half> @test_STRICT_UINT_TO_FP_no_inf16(<16 x i32> %a) {
; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0
; AVX512-NEXT: retq
entry:
- %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
- ret <16 x half> %vec
+ %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <16 x half> %vec
}
>From d7809f2e04db758093380802cf3ba17c8aed8b23 Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Tue, 7 Jan 2025 16:07:11 +0530
Subject: [PATCH 08/10] Update test with `+f16c`
---
.../X86/test_UINT_TO_FP_no_inf_corei7_avx.ll | 402 +++---------------
1 file changed, 61 insertions(+), 341 deletions(-)
diff --git a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
index 43517b3871cf86..74d2e042c10901 100644
--- a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
+++ b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
@@ -1,71 +1,27 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s -check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+f16c | FileCheck %s -check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s -check-prefix=AVX512
define <8 x half> @test_UINT_TO_FP_no_inf8(<8 x i32> %a) {
; AVX-LABEL: test_UINT_TO_FP_no_inf8:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: subq $88, %rsp
-; AVX-NEXT: .cfi_def_cfa_offset 96
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %ymm1
; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %ymm1
; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %ymm0
; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: addq $88, %rsp
-; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
;
; AVX512-LABEL: test_UINT_TO_FP_no_inf8:
@@ -82,8 +38,6 @@ entry:
define <8 x half> @test_STRICT_UINT_TO_FP_no_inf8(<8 x i32> %a) {
; AVX-LABEL: test_STRICT_UINT_TO_FP_no_inf8:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: subq $88, %rsp
-; AVX-NEXT: .cfi_def_cfa_offset 96
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
@@ -93,56 +47,8 @@ define <8 x half> @test_STRICT_UINT_TO_FP_no_inf8(<8 x i32> %a) {
; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: addq $88, %rsp
-; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
;
; AVX512-LABEL: test_STRICT_UINT_TO_FP_no_inf8:
@@ -159,128 +65,41 @@ entry:
define <16 x half> @test_UINT_TO_FP_no_inf16(<16 x i32> %a) {
; AVX-LABEL: test_UINT_TO_FP_no_inf16:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: subq $120, %rsp
-; AVX-NEXT: .cfi_def_cfa_offset 128
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpsrld $16, %xmm1, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
-; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm3
+; AVX-NEXT: vcvtdq2ps %ymm3, %ymm3
+; AVX-NEXT: vcvtps2ph $4, %ymm3, %xmm3
+; AVX-NEXT: vcvtph2ps %xmm3, %ymm3
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm4
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vpsrld $16, %xmm2, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
+; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %ymm0
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
+; AVX-NEXT: vmulps %ymm4, %ymm0, %ymm0
+; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX-NEXT: vcvtph2ps %xmm0, %ymm0
+; AVX-NEXT: vaddps %ymm3, %ymm0, %ymm0
+; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm2
+; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX-NEXT: vcvtps2ph $4, %ymm2, %xmm2
+; AVX-NEXT: vcvtph2ps %xmm2, %ymm2
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm3
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
-; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX-NEXT: callq __truncsfhf2@PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX-NEXT: addq $120, %rsp
-; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %ymm1
+; AVX-NEXT: vmulps %ymm4, %ymm1, %ymm1
+; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1
+; AVX-NEXT: vcvtph2ps %xmm1, %ymm1
+; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_UINT_TO_FP_no_inf16:
@@ -296,128 +115,29 @@ entry:
define <16 x half> @test_STRICT_UINT_TO_FP_no_inf16(<16 x i32> %a) {
; AVX-LABEL: test_STRICT_UINT_TO_FP_no_inf16:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: subq $120, %rsp
-; AVX-NEXT: .cfi_def_cfa_offset 128
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpsrld $16, %xmm1, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm2
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT: vpsrld $16, %xmm3, %xmm3
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
+; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm2
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX-NEXT: vpsrld $16, %xmm5, %xmm5
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2
+; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
-; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vpsrld $16, %xmm2, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
-; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
-; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[3,3,3,3]
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,1,3,3]
-; AVX-NEXT: callq __truncsfhf2 at PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX-NEXT: addq $120, %rsp
-; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
;
; AVX512-LABEL: test_STRICT_UINT_TO_FP_no_inf16:
>From edb53f6723fca777543c8e9c9ed48fab70e57ff0 Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Tue, 7 Jan 2025 16:54:01 +0530
Subject: [PATCH 09/10] Fix `+f16c` case
---
.../SelectionDAG/LegalizeVectorOps.cpp | 9 ++--
.../X86/test_UINT_TO_FP_no_inf_corei7_avx.ll | 52 ++++++-------------
2 files changed, 23 insertions(+), 38 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 1cbee72cfc7ce4..234dbacb6d2dfd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1780,9 +1780,12 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node,
// If STRICT_/FMUL is not supported by the target (in case of f16) replace the
// UINT_TO_FP with a larger float and round to the smaller type
if ((!IsStrict &&
- TLI.getOperationAction(ISD::FMUL, DstVT) == TargetLowering::Expand) ||
- (IsStrict && TLI.getOperationAction(ISD::STRICT_FMUL, DstVT) ==
- TargetLowering::Expand)) {
+ (TLI.getOperationAction(ISD::FMUL, DstVT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::FMUL, DstVT) == TargetLowering::Promote)) ||
+ (IsStrict && (TLI.getOperationAction(ISD::STRICT_FMUL, DstVT) ==
+ TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::STRICT_FMUL, DstVT) ==
+ TargetLowering::Promote))) {
EVT FPVT = BW == 32 ? MVT::f32 : MVT::f64;
SDValue UIToFP;
SDValue Result;
diff --git a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
index 74d2e042c10901..76eecdd72f8825 100644
--- a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
+++ b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
@@ -10,15 +10,9 @@ define <8 x half> @test_UINT_TO_FP_no_inf8(<8 x i32> %a) {
; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
-; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1
-; AVX-NEXT: vcvtph2ps %xmm1, %ymm1
; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1
-; AVX-NEXT: vcvtph2ps %xmm1, %ymm1
; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %ymm0
; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; AVX-NEXT: vzeroupper
@@ -65,39 +59,27 @@ entry:
define <16 x half> @test_UINT_TO_FP_no_inf16(<16 x i32> %a) {
; AVX-LABEL: test_UINT_TO_FP_no_inf16:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX-NEXT: vandps %ymm2, %ymm0, %ymm3
-; AVX-NEXT: vcvtdq2ps %ymm3, %ymm3
-; AVX-NEXT: vcvtps2ph $4, %ymm3, %xmm3
-; AVX-NEXT: vcvtph2ps %xmm3, %ymm3
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm4
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm2
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT: vpsrld $16, %xmm3, %xmm3
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
+; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT: vaddps %ymm0, %ymm2, %ymm0
; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %ymm0
-; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
-; AVX-NEXT: vmulps %ymm4, %ymm0, %ymm0
-; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %ymm0
-; AVX-NEXT: vaddps %ymm3, %ymm0, %ymm0
-; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
-; AVX-NEXT: vandps %ymm2, %ymm1, %ymm2
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm2
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX-NEXT: vpsrld $16, %xmm5, %xmm5
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2
-; AVX-NEXT: vcvtps2ph $4, %ymm2, %xmm2
-; AVX-NEXT: vcvtph2ps %xmm2, %ymm2
-; AVX-NEXT: vpsrld $16, %xmm1, %xmm3
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2
+; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
-; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1
-; AVX-NEXT: vcvtph2ps %xmm1, %ymm1
-; AVX-NEXT: vmulps %ymm4, %ymm1, %ymm1
-; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1
-; AVX-NEXT: vcvtph2ps %xmm1, %ymm1
-; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1
; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
>From 25850f3f5181c7bc51dc1c927867c7fbaf018410 Mon Sep 17 00:00:00 2001
From: abhishek-kaushik22 <abhishek.kaushik at intel.com>
Date: Tue, 7 Jan 2025 19:28:19 +0530
Subject: [PATCH 10/10] Address review comments
---
.../SelectionDAG/LegalizeVectorOps.cpp | 9 +-
.../X86/test_UINT_TO_FP_no_inf_corei7_avx.ll | 133 ------------
llvm/test/CodeGen/X86/uint_to_half.ll | 200 ++++++++++++++++++
3 files changed, 202 insertions(+), 140 deletions(-)
delete mode 100644 llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
create mode 100644 llvm/test/CodeGen/X86/uint_to_half.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 234dbacb6d2dfd..89a00c5a4f0439 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1779,13 +1779,8 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node,
// If STRICT_/FMUL is not supported by the target (in case of f16) replace the
// UINT_TO_FP with a larger float and round to the smaller type
- if ((!IsStrict &&
- (TLI.getOperationAction(ISD::FMUL, DstVT) == TargetLowering::Expand ||
- TLI.getOperationAction(ISD::FMUL, DstVT) == TargetLowering::Promote)) ||
- (IsStrict && (TLI.getOperationAction(ISD::STRICT_FMUL, DstVT) ==
- TargetLowering::Expand ||
- TLI.getOperationAction(ISD::STRICT_FMUL, DstVT) ==
- TargetLowering::Promote))) {
+ if ((!IsStrict && !TLI.isOperationLegalOrCustom(ISD::FMUL, DstVT)) ||
+ (IsStrict && !TLI.isOperationLegalOrCustom(ISD::STRICT_FMUL, DstVT))) {
EVT FPVT = BW == 32 ? MVT::f32 : MVT::f64;
SDValue UIToFP;
SDValue Result;
diff --git a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
deleted file mode 100644
index 76eecdd72f8825..00000000000000
--- a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll
+++ /dev/null
@@ -1,133 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+f16c | FileCheck %s -check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s -check-prefix=AVX512
-
-define <8 x half> @test_UINT_TO_FP_no_inf8(<8 x i32> %a) {
-; AVX-LABEL: test_UINT_TO_FP_no_inf8:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
-; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_UINT_TO_FP_no_inf8:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvtudq2ps %ymm0, %ymm0
-; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-entry:
- %vec = uitofp <8 x i32> %a to <8 x half>
- ret <8 x half> %vec
-}
-
-define <8 x half> @test_STRICT_UINT_TO_FP_no_inf8(<8 x i32> %a) {
-; AVX-LABEL: test_STRICT_UINT_TO_FP_no_inf8:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
-; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_STRICT_UINT_TO_FP_no_inf8:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvtudq2ps %ymm0, %ymm0
-; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-entry:
- %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f16.i32(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
- ret <8 x half> %vec
-}
-
-define <16 x half> @test_UINT_TO_FP_no_inf16(<16 x i32> %a) {
-; AVX-LABEL: test_UINT_TO_FP_no_inf16:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm2
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX-NEXT: vpsrld $16, %xmm3, %xmm3
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2
-; AVX-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
-; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2
-; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm1, %xmm2
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX-NEXT: vpsrld $16, %xmm5, %xmm5
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2
-; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2
-; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1
-; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
-; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1
-; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_UINT_TO_FP_no_inf16:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512-NEXT: retq
-entry:
- %vec = uitofp <16 x i32> %a to <16 x half>
- ret <16 x half> %vec
-}
-
-define <16 x half> @test_STRICT_UINT_TO_FP_no_inf16(<16 x i32> %a) {
-; AVX-LABEL: test_STRICT_UINT_TO_FP_no_inf16:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm2
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX-NEXT: vpsrld $16, %xmm3, %xmm3
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2
-; AVX-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
-; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2
-; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm1, %xmm2
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX-NEXT: vpsrld $16, %xmm5, %xmm5
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2
-; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2
-; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1
-; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1
-; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1
-; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_STRICT_UINT_TO_FP_no_inf16:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512-NEXT: retq
-entry:
- %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
- ret <16 x half> %vec
-}
diff --git a/llvm/test/CodeGen/X86/uint_to_half.ll b/llvm/test/CodeGen/X86/uint_to_half.ll
new file mode 100644
index 00000000000000..32339745ba75f9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/uint_to_half.ll
@@ -0,0 +1,200 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+f16c | FileCheck %s -check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+f16c | FileCheck %s -check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s -check-prefixes=AVX512
+
+define <8 x half> @test_uitofp_v8i32_v8f16(<8 x i32> %a) {
+; AVX1-LABEL: test_uitofp_v8i32_v8f16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_uitofp_v8i32_v8f16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT: vsubps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_uitofp_v8i32_v8f16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvtudq2ps %ymm0, %ymm0
+; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %vec = uitofp <8 x i32> %a to <8 x half>
+ ret <8 x half> %vec
+}
+
+define <8 x half> @test_strict_uitofp_v8i32_v8f16(<8 x i32> %a) {
+; AVX1-LABEL: test_strict_uitofp_v8i32_v8f16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_strict_uitofp_v8i32_v8f16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT: vsubps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_strict_uitofp_v8i32_v8f16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvtudq2ps %ymm0, %ymm0
+; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f16.i32(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <8 x half> %vec
+}
+
+define <16 x half> @test_uitofp_v16i32_v16f16(<16 x i32> %a) {
+; AVX1-LABEL: test_uitofp_v16i32_v16f16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
+; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpsrld $16, %xmm5, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vcvtps2ph $4, %ymm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_uitofp_v16i32_v16f16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT: vsubps %ymm5, %ymm0, %ymm0
+; AVX2-NEXT: vaddps %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX2-NEXT: vsubps %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vcvtps2ph $4, %ymm1, %xmm1
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_uitofp_v16i32_v16f16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512-NEXT: retq
+ %vec = uitofp <16 x i32> %a to <16 x half>
+ ret <16 x half> %vec
+}
+
+define <16 x half> @test_strict_uitofp_v16i32_v16f16(<16 x i32> %a) {
+; AVX1-LABEL: test_strict_uitofp_v16i32_v16f16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
+; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpsrld $16, %xmm5, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vcvtps2ph $4, %ymm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_strict_uitofp_v16i32_v16f16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT: vsubps %ymm5, %ymm0, %ymm0
+; AVX2-NEXT: vaddps %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX2-NEXT: vsubps %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vcvtps2ph $4, %ymm1, %xmm1
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_strict_uitofp_v16i32_v16f16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512-NEXT: retq
+ %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <16 x half> %vec
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX: {{.*}}
More information about the llvm-commits
mailing list