[llvm] [X86] Fold (f16 bitcast extract_vector_elt(v,0)) to (extract_vector_elt (v8f16 bitcast(v)),0) (PR #125877)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 5 08:21:26 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
Also handles possible truncations from i32 to i16.
Cleans up some of the poor codegen identified in #98630
---
Patch is 85.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125877.diff
14 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+13)
- (modified) llvm/test/CodeGen/X86/bfloat.ll (-12)
- (modified) llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll (-6)
- (modified) llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll (+189-239)
- (modified) llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll (+4-28)
- (modified) llvm/test/CodeGen/X86/fp-round.ll (-2)
- (modified) llvm/test/CodeGen/X86/fp-roundeven.ll (-2)
- (modified) llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll (-12)
- (modified) llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll (-24)
- (modified) llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll (-14)
- (modified) llvm/test/CodeGen/X86/half.ll (+57-87)
- (modified) llvm/test/CodeGen/X86/pr31088.ll (-8)
- (modified) llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll (-8)
- (modified) llvm/test/CodeGen/X86/vector-half-conversions.ll (+104-123)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6cf6061deba702..b0cebea5f29880 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45160,6 +45160,19 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
}
}
+ // Attempt to peek through f16 bitcasted extractions hidden by truncation.
+ if (VT == MVT::f16 && SrcVT == MVT::i16) {
+ SDValue Src = peekThroughTruncates(N0);
+ if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Src.getOperand(0).getValueSizeInBits() == 128 &&
+ isNullConstant(Src.getOperand(1))) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
+ DAG.getVectorIdxConstant(0, DL));
+ }
+ }
+
// Since MMX types are special and don't usually play with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index d67cd6b62c2b92..4d269cfff2afe6 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -82,8 +82,6 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; X86-NEXT: vmovd %eax, %xmm1
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; X86-NEXT: vmovw %xmm0, %eax
-; X86-NEXT: vmovw %eax, %xmm0
; X86-NEXT: retl
;
; SSE2-LABEL: add2:
@@ -110,8 +108,6 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; FP16-NEXT: vmovd %eax, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: vmovw %eax, %xmm0
; FP16-NEXT: retq
;
; AVXNC-LABEL: add2:
@@ -124,8 +120,6 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; AVXNC-NEXT: vmovd %eax, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
-; AVXNC-NEXT: vmovd %xmm0, %eax
-; AVXNC-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVXNC-NEXT: retq
%add = fadd bfloat %a, %b
ret bfloat %add
@@ -432,8 +426,6 @@ define bfloat @add_constant2(bfloat %a) nounwind {
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; X86-NEXT: vmovw %xmm0, %eax
-; X86-NEXT: vmovw %eax, %xmm0
; X86-NEXT: retl
;
; SSE2-LABEL: add_constant2:
@@ -454,8 +446,6 @@ define bfloat @add_constant2(bfloat %a) nounwind {
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: vmovw %eax, %xmm0
; FP16-NEXT: retq
;
; AVXNC-LABEL: add_constant2:
@@ -465,8 +455,6 @@ define bfloat @add_constant2(bfloat %a) nounwind {
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
-; AVXNC-NEXT: vmovd %xmm0, %eax
-; AVXNC-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVXNC-NEXT: retq
%add = fadd bfloat %a, 1.0
ret bfloat %add
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
index 04087c4f0dd5ed..556b0deaf4c830 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
@@ -154,8 +154,6 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind {
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512-NEXT: retq
entry:
@@ -239,15 +237,11 @@ define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind
; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vmovd %xmm2, %eax
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512-NEXT: vmovd %xmm0, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index bfff6ef41dbe00..fbc3fbf1055f45 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -1812,212 +1812,186 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
;
; AVX512-LABEL: test_fmaximumnum_v4f16:
; AVX512: # %bb.0:
-; AVX512-NEXT: subq $88, %rsp
+; AVX512-NEXT: subq $72, %rsp
; AVX512-NEXT: vmovdqa %xmm1, %xmm4
-; AVX512-NEXT: vmovdqa %xmm0, %xmm6
+; AVX512-NEXT: vmovdqa %xmm0, %xmm8
; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vucomiss %xmm0, %xmm0
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vucomiss %xmm2, %xmm2
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vucomiss %xmm1, %xmm1
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
-; AVX512-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2
-; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2}
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm2
+; AVX512-NEXT: vucomiss %xmm0, %xmm1
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT: vmovd %eax, %xmm2
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm9
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm9
; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
-; AVX512-NEXT: vxorps %xmm10, %xmm10, %xmm10
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vucomiss %xmm2, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vucomiss %xmm1, %xmm1
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vucomiss %xmm3, %xmm3
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,3,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vucomiss %xmm2, %xmm2
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1
+; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2}
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm3
-; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
-; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2
-; AVX512-NEXT: vucomiss %xmm2, %xmm3
+; AVX512-NEXT: vucomiss %xmm3, %xmm2
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vmulss %xmm2, %xmm9, %xmm2
-; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm0
+; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vucomiss %xmm1, %xmm1
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vucomiss %xmm3, %xmm3
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vucomiss %xmm2, %xmm2
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1
+; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2}
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm5
-; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm3
-; AVX512-NEXT: vucomiss %xmm3, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vucomiss %xmm1, %xmm2
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm5, %xmm3, %xmm3 {%k1}
-; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm4[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm0
+; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm4[1,0]
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vucomiss %xmm2, %xmm2
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm6[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512-NEXT: vucomiss %xmm5, %xmm5
+; AVX512-NEXT: vshufpd {{.*#+}} xmm7 = xmm8[1,0]
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vucomiss %xmm7, %xmm7
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm5, %xmm5 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm15
-; AVX512-NEXT: vcvtph2ps %xmm15, %xmm5
-; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm5
+; AVX512-NEXT: vmovss %xmm2, %xmm7, %xmm7 {%k2}
+; AVX512-NEXT: vcvtps2ph $4, %xmm7, %xmm14
+; AVX512-NEXT: vcvtph2ps %xmm14, %xmm7
+; AVX512-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vucomiss %xmm2, %xmm7
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vmulss %xmm3, %xmm9, %xmm3
-; AVX512-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm10[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1
+; AVX512-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vxorps %xmm15, %xmm15, %xmm15
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm5
+; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm3
+; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm0
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovd %xmm0, %ecx
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm3
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm2
+; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7]
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vucomiss %xmm0, %xmm0
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7]
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vucomiss %xmm2, %xmm2
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7]
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vucomiss %xmm1, %xmm1
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm11
-; AVX512-NEXT: vcvtph2ps %xmm11, %xmm3
-; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX512-NEXT: vucomiss %xmm2, %xmm3
+; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2}
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm12
+; AVX512-NEXT: vcvtph2ps %xmm12, %xmm1
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm13
+; AVX512-NEXT: vcvtph2ps %xmm13, %xmm6
+; AVX512-NEXT: vucomiss %xmm6, %xmm1
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmovss %xmm1, %xmm6, %xmm6 {%k1}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vucomiss %xmm0, %xmm0
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vucomiss %xmm3, %xmm3
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm8[1,1,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vucomiss %xmm1, %xmm1
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm7
-; AVX512-NEXT: vcvtph2ps %xmm7, %xmm3
+; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2}
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm10
+; AVX512-NEXT: vcvtph2ps %xmm10, %xmm3
; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm12
-; AVX512-NEXT: vcvtph2ps %xmm12, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm3
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm11
+; AVX512-NEXT: vcvtph2ps %xmm11, %xmm5
+; AVX512-NEXT: vucomiss %xmm5, %xmm3
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vmulss %xmm2, %xmm9, %xmm2
-; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm14
-; AVX512-NEXT: vmovd %xmm14, %eax
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm13
-; AVX512-NEXT: vmovd %xmm13, %ecx
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT: vmovss %xmm3, %xmm5, %xmm5 {%k1}
; AVX512-NEXT: vcvtph2ps %xmm4, %xmm0
; AVX512-NEXT: vucomiss %xmm0, %xmm0
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vcvtph2ps %xmm6, %xmm2
-; ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/125877
More information about the llvm-commits
mailing list