[llvm] [X86][SDAG] Improve the lowering of `s|uitofp i8|i16 to half` (PR #70834)
via llvm-commits
llvm-commits@lists.llvm.org
Tue Oct 31 10:19:38 PDT 2023
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: None (qcolombet)
Prior to this patch, vector `s|uitofp` conversions from narrow types (`<= i16`) to `half` were scalarized when the hardware doesn't support fp16 conversions natively. This patch fixes that by avoiding `i16` as an intermediate type when the hardware has no support for converting it to `half`. In other words, when the target doesn't support `avx512fp16`, we avoid using intermediate `i16` vectors for `s|uitofp` conversions.
Instead, we extend the narrow type to `i32`, which is then converted to `float` and truncated to `half`.
Put differently, we go from:
```
s|uitofp iNarrow %src to half
```
To:
```
%tmp = s|zext iNarrow %src to i32
%tmpfp = s|uitofp i32 %tmp to float
fptrunc float %tmpfp to half
```
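As a concrete example (the value names here are illustrative, not taken from the patch), an unsigned `<8 x i16>` conversion on a target without `avx512fp16` now lowers along these lines:
```
%ext = zext <8 x i16> %src to <8 x i32>
%fp  = uitofp <8 x i32> %ext to <8 x float>
%res = fptrunc <8 x float> %fp to <8 x half>
```
The signed case is the same with `sext`/`sitofp`.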
Note that this patch:
- Doesn't change the actual lowering of i32 to half. I.e., the `float` intermediate step and the final truncation are what already existed for this input type.
- Only changes the intermediate type used in the lowering of `s|uitofp`. I.e., the first `s|zext` now extends to i32 rather than i16.
Remark: the vector and scalar lowerings of `s|uitofp` don't share the same code path. I'm not super happy about that, but I'm not planning to fix it, at least not in this PR.
This fixes https://github.com/llvm/llvm-project/issues/67080
---
Patch is 45.98 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70834.diff
3 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+30-16)
- (modified) llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll (+93-254)
- (added) llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll (+319)
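For reference, the new test presumably exercises patterns along these lines (a hypothetical sketch; the actual RUN lines and CHECK patterns are in `select-narrow-int-to-fp.ll`):
```
; Narrow unsigned source to half destination.
define <4 x half> @uitofp_v4i8(<4 x i8> %x) {
  %r = uitofp <4 x i8> %x to <4 x half>
  ret <4 x half> %r
}

; Narrow signed source to half destination.
define <4 x half> @sitofp_v4i16(<4 x i16> %x) {
  %r = sitofp <4 x i16> %x to <4 x half>
  ret <4 x half> %r
}
```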
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 18f6a695e4502e9..2560bdbea3ce3b6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53390,19 +53390,26 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
- // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
- // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
+ // Using i16 as an intermediate type is a bad idea, unless we have HW support
+ // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
+ // if hasFP16 support:
+ // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
+ // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
+ // else
+ // UINT_TO_FP(vXi1~31) -> UINT_TO_FP(ZEXT(vXi1~31 to vXi32))
// UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
unsigned ScalarSize = InVT.getScalarSizeInBits();
- if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
+ if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
+ ScalarSize >= 64)
return SDValue();
SDLoc dl(N);
- EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
- ScalarSize < 16 ? MVT::i16
- : ScalarSize < 32 ? MVT::i32
- : MVT::i64,
- InVT.getVectorNumElements());
+ EVT DstVT =
+ EVT::getVectorVT(*DAG.getContext(),
+ (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
+ : ScalarSize < 32 ? MVT::i32
+ : MVT::i64,
+ InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
if (IsStrict)
return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
@@ -53453,19 +53460,26 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
- // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
- // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
+ // Using i16 as an intermediate type is a bad idea, unless we have HW support
+ // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
+ // if hasFP16 support:
+ // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
+ // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
+ // else
+ // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
// SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
unsigned ScalarSize = InVT.getScalarSizeInBits();
- if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
+ if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
+ ScalarSize >= 64)
return SDValue();
SDLoc dl(N);
- EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
- ScalarSize < 16 ? MVT::i16
- : ScalarSize < 32 ? MVT::i32
- : MVT::i64,
- InVT.getVectorNumElements());
+ EVT DstVT =
+ EVT::getVectorVT(*DAG.getContext(),
+ (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
+ : ScalarSize < 32 ? MVT::i32
+ : MVT::i64,
+ InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 8d98ec7eaac2a4c..080ad3a7b0b463e 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -154,52 +154,44 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: pslld $23, %xmm0
; CHECK-SSE-NEXT: paddd %xmm2, %xmm0
; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; CHECK-SSE-NEXT: pslld $16, %xmm0
+; CHECK-SSE-NEXT: psrld $16, %xmm0
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-SSE-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $2, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: cvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $4, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $6, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $0, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $2, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: cvtdq2ps (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $4, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pextrw $6, %xmm0, %eax
-; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
-; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
+; CHECK-SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -212,23 +204,23 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
@@ -254,12 +246,13 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
-; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-SSE-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-SSE-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE-NEXT: addq $88, %rsp
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
; CHECK-SSE-NEXT: retq
@@ -273,52 +266,47 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; CHECK-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-AVX2-NEXT: vpextrw $0, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
+; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,21248,0,21248,0,21248,0,21248,0,21248,0,21248,0,21248,0,21248]
+; CHECK-AVX2-NEXT: vsubps %ymm1, %ymm2, %ymm1
+; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; CHECK-AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; CHECK-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; CHECK-AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $1, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
+; CHECK-AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: vzeroupper
+; CHECK-AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-AVX2-NEXT: # xmm0 = mem[1,0]
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $3, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: vzeroupper
+; CHECK-AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $4, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
+; CHECK-AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $5, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: vzeroupper
+; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $6, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: vzeroupper
+; CHECK-AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-AVX2-NEXT: # xmm0 = mem[1,0]
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; CHECK-AVX2-NEXT: vpextrw $7, %xmm0, %eax
-; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: vzeroupper
+; CHECK-AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -343,8 +331,8 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -373,8 +361,8 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
-; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -387,55 +375,11 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm1
-; CHECK-NO-FASTFMA-NEXT: vpmovdw %zmm1, %ymm0
-; CHECK-NO-FASTFMA-NEXT: vpextrw $7, %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm2, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
-; CHECK-NO-FASTFMA-NEXT: vpextrw $6, %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm3, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm3, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; CHECK-NO-FASTFMA-NEXT: vpextrw $5, %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm4, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm3, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3
-; CHECK-NO-FASTFMA-NEXT: vpextrw $4, %xmm0, %eax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm4, %xmm4
-; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; CHECK-NO-FASTFMA-NEXT: vmovd %xmm4, %eax
-; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4
-; CHECK-NO-FASTFMA-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; CHECK-NO-FASTFMA-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; CHECK-NO-FASTFMA-NEXT: vpextrw $3, %xmm0, %eax
-; CHECK-NO-...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/70834