[llvm] [X86][SDAG] Improve the lowering of `s|uitofp i8|i16 to half` (PR #70834)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 1 06:02:29 PDT 2023
https://github.com/qcolombet updated https://github.com/llvm/llvm-project/pull/70834
>From c3d80cb7d16c5b1fc5239b717812e40ab3dbe4ca Mon Sep 17 00:00:00 2001
From: Quentin Colombet <quentin.colombet at gmail.com>
Date: Tue, 31 Oct 2023 16:50:24 +0100
Subject: [PATCH 1/2] [X86][SDAG] Improve the lowering of `s|uitofp i8|i16 to
half`
Prior to this patch, vector `s|uitofp` conversions from narrow types (`<= i16`)
were scalarized when the hardware does not support fp16 conversions natively.
This patch fixes that by avoiding `i16` as an intermediate type when there is
no hardware support for converting this type to half. In other words, when the
target doesn't support `avx512fp16`, we avoid using intermediate `i16` vectors
for `s|uitofp` conversions.
Instead we extend the narrow type to `i32`, which is then converted to
`float` and truncated to `half`.
Put differently, we go from:
```
s|uitofp iNarrow %src to half
```
to:
```
%tmp = s|zext iNarrow %src to i32
%tmpfp = s|uitofp i32 %tmp to float
fptrunc float %tmpfp to half
```
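For illustration, here is a minimal sketch of the vector case this combine targets when the target lacks `avx512fp16`. The function name and the `<8 x i16>` width are only examples, and the rewrite actually happens in SelectionDAG; it is written as IR here for readability:
```
; Before: a direct narrow-int-to-half conversion, which used to get scalarized.
define <8 x half> @narrow_uitofp(<8 x i16> %src) {
  %res = uitofp <8 x i16> %src to <8 x half>
  ret <8 x half> %res
}
; After this patch it is lowered as if it had been written:
;   %ext = zext <8 x i16> %src to <8 x i32>
;   %fp  = uitofp <8 x i32> %ext to <8 x float>
;   %res = fptrunc <8 x float> %fp to <8 x half>
```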
Note that this patch:
- Doesn't change the actual lowering of i32 to half. I.e., the `float`
intermediate step and the final truncation are what already existed for this
input type.
- Changes only the intermediate type used for the lowering of `s|uitofp`,
i.e., the initial `s|zext` of the narrow type to i32.
Remark: The vector and scalar lowerings of `s|uitofp` don't use the same
code path. Not super happy about that, but I'm not planning to fix it, at
least not in this PR.
This fixes https://github.com/llvm/llvm-project/issues/67080
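For reference, a condensed, hand-written version of one of the added tests shows the improvement. The RUN line mirrors the one used in the final revision of this PR, and the CHECK lines paraphrase the expected vectorized lowering rather than reproducing the autogenerated assertions:
```
; RUN: llc -o - %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s
; Without avx512fp16 the conversion now stays vectorized (zext to i32,
; convert to float, truncate to half) instead of being scalarized into
; per-element cvtsi2ss calls.
define <16 x half> @vector_uint16ToHalf(<16 x i16> %int16) {
; CHECK-LABEL: vector_uint16ToHalf:
; CHECK: vpmovzxwd
; CHECK: vcvtdq2ps
; CHECK: vcvtps2ph
  %fp = uitofp <16 x i16> %int16 to <16 x half>
  ret <16 x half> %fp
}
```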
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 46 ++-
.../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 347 +++++-------------
.../CodeGen/X86/select-narrow-int-to-fp.ll | 319 ++++++++++++++++
3 files changed, 442 insertions(+), 270 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 18f6a695e4502e9..2560bdbea3ce3b6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53390,19 +53390,26 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
- // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
- // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
+ // Using i16 as an intermediate type is a bad idea, unless we have HW support
+ // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
+ // if hasFP16 support:
+ // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
+ // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
+ // else
+ // UINT_TO_FP(vXi1~31) -> UINT_TO_FP(ZEXT(vXi1~31 to vXi32))
// UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
unsigned ScalarSize = InVT.getScalarSizeInBits();
- if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
+ if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
+ ScalarSize >= 64)
return SDValue();
SDLoc dl(N);
- EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
- ScalarSize < 16 ? MVT::i16
- : ScalarSize < 32 ? MVT::i32
- : MVT::i64,
- InVT.getVectorNumElements());
+ EVT DstVT =
+ EVT::getVectorVT(*DAG.getContext(),
+ (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
+ : ScalarSize < 32 ? MVT::i32
+ : MVT::i64,
+ InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
if (IsStrict)
return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
@@ -53453,19 +53460,26 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
- // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
- // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
+ // Using i16 as an intermediate type is a bad idea, unless we have HW support
+ // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
+ // if hasFP16 support:
+ // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
+ // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
+ // else
+ // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
// SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
unsigned ScalarSize = InVT.getScalarSizeInBits();
- if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
+ if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
+ ScalarSize >= 64)
return SDValue();
SDLoc dl(N);
- EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
- ScalarSize < 16 ? MVT::i16
- : ScalarSize < 32 ? MVT::i32
- : MVT::i64,
- InVT.getVectorNumElements());
+ EVT DstVT =
+ EVT::getVectorVT(*DAG.getContext(),
+ (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
+ : ScalarSize < 32 ? MVT::i32
+ : MVT::i64,
+ InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 8d98ec7eaac2a4c..080ad3a7b0b463e 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -154,52 +154,44 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: pslld $23, %xmm0
; CHECK-SSE-NEXT: paddd %xmm2, %xmm0
; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; CHECK-SSE-NEXT: pslld $16, %xmm0
+; CHECK-SSE-NEXT: psrld $16, %xmm0
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-SSE-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $2, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: cvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $4, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $6, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $2, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: cvtdq2ps (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $4, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $6, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -212,23 +204,23 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
@@ -254,12 +246,13 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-SSE-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-SSE-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE-NEXT: addq $88, %rsp
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
; CHECK-SSE-NEXT: retq
@@ -273,52 +266,47 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; CHECK-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-AVX2-NEXT: vpextrw $0, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
+; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,21248,0,21248,0,21248,0,21248,0,21248,0,21248,0,21248,0,21248]
+; CHECK-AVX2-NEXT: vsubps %ymm1, %ymm2, %ymm1
+; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; CHECK-AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; CHECK-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; CHECK-AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2 at PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $1, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
+; CHECK-AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2 at PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: vzeroupper
+; CHECK-AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-AVX2-NEXT: # xmm0 = mem[1,0]
; CHECK-AVX2-NEXT: callq __truncsfhf2 at PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $3, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: vzeroupper
+; CHECK-AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; CHECK-AVX2-NEXT: callq __truncsfhf2 at PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $4, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
+; CHECK-AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2 at PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $5, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: vzeroupper
+; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: callq __truncsfhf2 at PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $6, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: vzeroupper
+; CHECK-AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-AVX2-NEXT: # xmm0 = mem[1,0]
; CHECK-AVX2-NEXT: callq __truncsfhf2 at PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $7, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: vzeroupper
+; CHECK-AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; CHECK-AVX2-NEXT: callq __truncsfhf2 at PLT
; CHECK-AVX2-NEXT: callq __extendhfsf2 at PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -343,8 +331,8 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-AVX2-NEXT: callq __extendhfsf2 at PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2 at PLT
-; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -373,8 +361,8 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-AVX2-NEXT: callq __extendhfsf2 at PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2 at PLT
-; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -387,55 +375,11 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm1
-; CHECK-NO-FASTFMA-NEXT: vpmovdw %zmm1, %ymm0
-; CHECK-NO-FASTFMA-NEXT: vpextrw $7, %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm2, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
-; CHECK-NO-FASTFMA-NEXT: vpextrw $6, %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm3, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm3, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; CHECK-NO-FASTFMA-NEXT: vpextrw $5, %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm4, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm3, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vpextrw $4, %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm4, %xmm4
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm4, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4
-; CHECK-NO-FASTFMA-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; CHECK-NO-FASTFMA-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; CHECK-NO-FASTFMA-NEXT: vpextrw $3, %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm3, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vpextrw $2, %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm4
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm4, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4
-; CHECK-NO-FASTFMA-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; CHECK-NO-FASTFMA-NEXT: vpextrw $0, %xmm1, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm1, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vpextrw $1, %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NO-FASTFMA-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; CHECK-NO-FASTFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
+; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NO-FASTFMA-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; CHECK-NO-FASTFMA-NEXT: vcvtudq2ps %zmm0, %zmm0
+; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %ymm0
; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3]
; CHECK-NO-FASTFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
@@ -447,53 +391,9 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: vpextrw $7, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm1
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; CHECK-FMA-NEXT: vmovd %xmm1, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vpextrw $6, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; CHECK-FMA-NEXT: vmovd %xmm2, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
-; CHECK-FMA-NEXT: vpextrw $5, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm3, %xmm3
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; CHECK-FMA-NEXT: vmovd %xmm3, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3
-; CHECK-FMA-NEXT: vpextrw $4, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm4, %xmm4
-; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm4, %xmm2
-; CHECK-FMA-NEXT: vmovd %xmm2, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
-; CHECK-FMA-NEXT: vpextrw $3, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm4
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK-FMA-NEXT: vmovd %xmm4, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3
-; CHECK-FMA-NEXT: vpextrw $2, %xmm0, %eax
-; CHECK-FMA-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm2
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; CHECK-FMA-NEXT: vmovd %xmm2, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
-; CHECK-FMA-NEXT: vpextrw $1, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm4
-; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm4, %xmm3
-; CHECK-FMA-NEXT: vmovd %xmm3, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3
-; CHECK-FMA-NEXT: vpextrw $0, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm0
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovd %xmm0, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; CHECK-FMA-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-FMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0
+; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %ymm0
; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
@@ -1145,18 +1045,17 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; CHECK-SSE-NEXT: pslld $23, %xmm0
; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-SSE-NEXT: pextrw $1, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT: pxor %xmm0, %xmm0
+; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = mem[1,1,1,1]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1167,9 +1066,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
+; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-SSE-NEXT: addq $40, %rsp
; CHECK-SSE-NEXT: retq
;
@@ -1205,29 +1103,14 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NO-FASTFMA-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0]
+; CHECK-NO-FASTFMA-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,0,0,2,2,0,0]
+; CHECK-NO-FASTFMA-NEXT: # ymm1 = mem[0,1,0,1]
; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
-; CHECK-NO-FASTFMA-NEXT: vpmovdw %zmm0, %ymm1
-; CHECK-NO-FASTFMA-NEXT: vpextrw $0, %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vpextrw $1, %xmm1, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm1, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; CHECK-NO-FASTFMA-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK-NO-FASTFMA-NEXT: vmovaps {{.*#+}} xmm1 = [16,0,0,0]
-; CHECK-NO-FASTFMA-NEXT: xorl %eax, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm2, %eax
-; CHECK-NO-FASTFMA-NEXT: vmovd %eax, %xmm2
-; CHECK-NO-FASTFMA-NEXT: vpbroadcastw %xmm2, %xmm2
-; CHECK-NO-FASTFMA-NEXT: vpermt2ps %zmm0, %zmm1, %zmm2
-; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm2, %ymm0
+; CHECK-NO-FASTFMA-NEXT: vpmovdw %zmm0, %ymm0
+; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NO-FASTFMA-NEXT: vcvtudq2ps %zmm0, %zmm0
+; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %ymm0
; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1]
; CHECK-NO-FASTFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
@@ -1238,53 +1121,9 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: vpextrw $7, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm1
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; CHECK-FMA-NEXT: vmovd %xmm1, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vpextrw $6, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; CHECK-FMA-NEXT: vmovd %xmm2, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
-; CHECK-FMA-NEXT: vpextrw $5, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm3, %xmm3
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; CHECK-FMA-NEXT: vmovd %xmm3, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3
-; CHECK-FMA-NEXT: vpextrw $4, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm4, %xmm4
-; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm4, %xmm2
-; CHECK-FMA-NEXT: vmovd %xmm2, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
-; CHECK-FMA-NEXT: vpextrw $3, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm4
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK-FMA-NEXT: vmovd %xmm4, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3
-; CHECK-FMA-NEXT: vpextrw $2, %xmm0, %eax
-; CHECK-FMA-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm2
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; CHECK-FMA-NEXT: vmovd %xmm2, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
-; CHECK-FMA-NEXT: vpextrw $1, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm4
-; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm4, %xmm3
-; CHECK-FMA-NEXT: vmovd %xmm3, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3
-; CHECK-FMA-NEXT: vpextrw $0, %xmm0, %eax
-; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm5, %xmm0
-; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovd %xmm0, %eax
-; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; CHECK-FMA-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-FMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-FMA-NEXT: vcvtudq2ps %ymm0, %ymm0
+; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %ymm0
; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll b/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll
new file mode 100644
index 000000000000000..8f3e8013423e285
--- /dev/null
+++ b/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll
@@ -0,0 +1,319 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -o - %s -mtriple=x86_64-unknown-unknown -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK-NO_FP16
+; RUN: llc -o - %s -mtriple=x86_64-unknown-unknown -mcpu=cascadelake -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK-WITH_FP16
+
+; Note: We could check more configurations, but anything with software
+; emulation of fp16 generates a ton of assembly code and is not particularly
+; interesting.
+
+;----------------------------------------
+; i8 input
+;----------------------------------------
+
+; uint8_t to float.
+; - Go from i8 to i32: zext
+; - Convert i32 to float
+define float @uint8ToFloat(i8 %int8) {
+; CHECK-NO_FP16-LABEL: uint8ToFloat:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: movzbl %dil, %eax
+; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: uint8ToFloat:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: movzbl %dil, %eax
+; CHECK-WITH_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = uitofp i8 %int8 to float
+ ret float %fp32
+}
+
+; vector uint8_t to float.
+; Same as @uint8ToFloat but with vector types.
+define <16 x float> @vector_uint8ToFloat(<16 x i8> %int8) {
+; CHECK-NO_FP16-LABEL: vector_uint8ToFloat:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-NO_FP16-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: vector_uint8ToFloat:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-WITH_FP16-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = uitofp <16 x i8> %int8 to <16 x float>
+ ret <16 x float> %fp32
+}
+
+
+; uint8_t to half.
+;
+; If no half support:
+; - Go from i8 to i32: zext
+; - Convert i32 to float
+; - Trunc from float to half
+;
+; Else if half support:
+; - Go from i8 to i32: zext
+; - Convert i32 to half
+define half @uint8ToHalf(i8 %int8) {
+; CHECK-NO_FP16-LABEL: uint8ToHalf:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: movzbl %dil, %eax
+; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: vmovd %xmm0, %eax
+; CHECK-NO_FP16-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: uint8ToHalf:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: movzbl %dil, %eax
+; CHECK-WITH_FP16-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = uitofp i8 %int8 to half
+ ret half %fp32
+}
+
+; vector uint8_t to half.
+;
+; If no half support:
+; - Go from i8 to i32: zext
+; - Convert i32 to float
+; - Trunc from float to half
+;
+; Else if half support:
+; - Go from i8 to i16: zext
+; - Convert i16 to half
+;
+; The difference with the scalar version (uint8ToHalf) is that we use i16
+; for the intermediate type when we have half support.
+define <16 x half> @vector_uint8ToHalf(<16 x i8> %int8) {
+; CHECK-NO_FP16-LABEL: vector_uint8ToHalf:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-NO_FP16-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: vector_uint8ToHalf:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-WITH_FP16-NEXT: vcvtuw2ph %ymm0, %ymm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = uitofp <16 x i8> %int8 to <16 x half>
+ ret <16 x half> %fp32
+}
+
+; Same as uint8_t but with the signed variant.
+; I.e., use sext instead of zext.
+define float @sint8ToFloat(i8 %int8) {
+; CHECK-NO_FP16-LABEL: sint8ToFloat:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: movsbl %dil, %eax
+; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: sint8ToFloat:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: movsbl %dil, %eax
+; CHECK-WITH_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = sitofp i8 %int8 to float
+ ret float %fp32
+}
+
+define <16 x float> @vector_sint8ToFloat(<16 x i8> %int8) {
+; CHECK-NO_FP16-LABEL: vector_sint8ToFloat:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: vpmovsxbd %xmm0, %zmm0
+; CHECK-NO_FP16-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: vector_sint8ToFloat:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: vpmovsxbd %xmm0, %zmm0
+; CHECK-WITH_FP16-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = sitofp <16 x i8> %int8 to <16 x float>
+ ret <16 x float> %fp32
+}
+
+define half @sint8ToHalf(i8 %int8) {
+; CHECK-NO_FP16-LABEL: sint8ToHalf:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: movsbl %dil, %eax
+; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: vmovd %xmm0, %eax
+; CHECK-NO_FP16-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: sint8ToHalf:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: movsbl %dil, %eax
+; CHECK-WITH_FP16-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = sitofp i8 %int8 to half
+ ret half %fp32
+}
+
+define <16 x half> @vector_sint8ToHalf(<16 x i8> %int8) {
+; CHECK-NO_FP16-LABEL: vector_sint8ToHalf:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: vpmovsxbd %xmm0, %zmm0
+; CHECK-NO_FP16-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: vector_sint8ToHalf:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-WITH_FP16-NEXT: vcvtw2ph %ymm0, %ymm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = sitofp <16 x i8> %int8 to <16 x half>
+ ret <16 x half> %fp32
+}
+
+
+;----------------------------------------
+; i16 input
+;----------------------------------------
+
+; Similar lowering as i8, but with i16 as the input type.
+
+define float @uint16ToFloat(i16 %int16) {
+; CHECK-NO_FP16-LABEL: uint16ToFloat:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: movzwl %di, %eax
+; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: uint16ToFloat:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: movzwl %di, %eax
+; CHECK-WITH_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = uitofp i16 %int16 to float
+ ret float %fp32
+}
+
+define <16 x float> @vector_uint16ToFloat(<16 x i16> %int16) {
+; CHECK-NO_FP16-LABEL: vector_uint16ToFloat:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NO_FP16-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: vector_uint16ToFloat:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-WITH_FP16-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = uitofp <16 x i16> %int16 to <16 x float>
+ ret <16 x float> %fp32
+}
+
+define half @uint16ToHalf(i16 %int16) {
+; CHECK-NO_FP16-LABEL: uint16ToHalf:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: movzwl %di, %eax
+; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: vmovd %xmm0, %eax
+; CHECK-NO_FP16-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: uint16ToHalf:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: movzwl %di, %eax
+; CHECK-WITH_FP16-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = uitofp i16 %int16 to half
+ ret half %fp32
+}
+
+define <16 x half> @vector_uint16ToHalf(<16 x i16> %int16) {
+; CHECK-NO_FP16-LABEL: vector_uint16ToHalf:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NO_FP16-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: vector_uint16ToHalf:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: vcvtuw2ph %ymm0, %ymm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = uitofp <16 x i16> %int16 to <16 x half>
+ ret <16 x half> %fp32
+}
+
+define float @sint16ToFloat(i16 %int16) {
+; CHECK-NO_FP16-LABEL: sint16ToFloat:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: movswl %di, %eax
+; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: sint16ToFloat:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: movswl %di, %eax
+; CHECK-WITH_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = sitofp i16 %int16 to float
+ ret float %fp32
+}
+
+define <16 x float> @vector_sint16ToFloat(<16 x i16> %int16) {
+; CHECK-NO_FP16-LABEL: vector_sint16ToFloat:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NO_FP16-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: vector_sint16ToFloat:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-WITH_FP16-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = sitofp <16 x i16> %int16 to <16 x float>
+ ret <16 x float> %fp32
+}
+
+define half @sint16ToHalf(i16 %int16) {
+; CHECK-NO_FP16-LABEL: sint16ToHalf:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: movswl %di, %eax
+; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: vmovd %xmm0, %eax
+; CHECK-NO_FP16-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: sint16ToHalf:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: movswl %di, %eax
+; CHECK-WITH_FP16-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = sitofp i16 %int16 to half
+ ret half %fp32
+}
+
+define <16 x half> @vector_sint16ToHalf(<16 x i16> %int16) {
+; CHECK-NO_FP16-LABEL: vector_sint16ToHalf:
+; CHECK-NO_FP16: # %bb.0:
+; CHECK-NO_FP16-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NO_FP16-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; CHECK-NO_FP16-NEXT: retq
+;
+; CHECK-WITH_FP16-LABEL: vector_sint16ToHalf:
+; CHECK-WITH_FP16: # %bb.0:
+; CHECK-WITH_FP16-NEXT: vcvtw2ph %ymm0, %ymm0
+; CHECK-WITH_FP16-NEXT: retq
+ %fp32 = sitofp <16 x i16> %int16 to <16 x half>
+ ret <16 x half> %fp32
+}
>From 4e1aab475bdbe058381f2468bf0021aaacf9534a Mon Sep 17 00:00:00 2001
From: Quentin Colombet <quentin.colombet at gmail.com>
Date: Wed, 1 Nov 2023 14:02:01 +0100
Subject: [PATCH 2/2] Use x86-64-v4 instead of specific CPU name
---
llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll b/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll
index 8f3e8013423e285..15bea0dd4a46950 100644
--- a/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll
+++ b/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -o - %s -mtriple=x86_64-unknown-unknown -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK-NO_FP16
-; RUN: llc -o - %s -mtriple=x86_64-unknown-unknown -mcpu=cascadelake -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK-WITH_FP16
+; RUN: llc -o - %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK-NO_FP16
+; RUN: llc -o - %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK-WITH_FP16
; Note: We could check more configurations, but anything with software
; emulation of fp16 generates a ton of assembly code and is not particularly