[llvm] [X86] Avoid zero extend i16 when inserting fp16 (PR #126194)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 6 23:23:26 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Phoebe Wang (phoebewang)
<details>
<summary>Changes</summary>
---
Patch is 85.78 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/126194.diff
16 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+4-2)
- (modified) llvm/test/CodeGen/X86/avx512-insert-extract.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/avx512-vec-cmp.ll (+2-4)
- (modified) llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll (+22-22)
- (modified) llvm/test/CodeGen/X86/fminimum-fmaximum.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll (+137-137)
- (modified) llvm/test/CodeGen/X86/fp-round.ll (+2-3)
- (modified) llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll (+129-216)
- (modified) llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll (+67-81)
- (modified) llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll (+24-36)
- (modified) llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll (+15-29)
- (modified) llvm/test/CodeGen/X86/fpclamptosat_vec.ll (+12-12)
- (modified) llvm/test/CodeGen/X86/half-constrained.ll (+28-24)
- (modified) llvm/test/CodeGen/X86/half-darwin.ll (+3-2)
- (modified) llvm/test/CodeGen/X86/half.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/pr116153.ll (+2-2)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 744e4e740cb210..96b140c9805f43 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -21878,9 +21878,11 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
}
In = DAG.getBitcast(MVT::i16, In);
- In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
- getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
+ In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
+ In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
+ getZeroVector(MVT::v4i32, Subtarget, DAG, DL), In,
DAG.getVectorIdxConstant(0, DL));
+ In = DAG.getBitcast(MVT::v8i16, In);
SDValue Res;
if (IsStrict) {
Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 7ce37c637a79ca..fef29eb95b173c 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -2166,7 +2166,7 @@ define void @test_concat_v2i1(ptr %arg, ptr %arg1, ptr %arg2) nounwind {
; KNL-NEXT: setb %al
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k0
-; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; KNL-NEXT: vpsrld $16, %xmm0, %xmm0
; KNL-NEXT: vcvtph2ps %xmm0, %xmm0
; KNL-NEXT: vucomiss %xmm2, %xmm0
; KNL-NEXT: setb %al
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index 5ce2b56cbd43a0..210513fe31783e 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1443,8 +1443,7 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; KNL: ## %bb.0: ## %entry
; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07]
-; KNL-NEXT: vpshuflw $85, %xmm0, %xmm1 ## encoding: [0xc5,0xfb,0x70,0xc8,0x55]
-; KNL-NEXT: ## xmm1 = xmm0[1,1,1,1,4,5,6,7]
+; KNL-NEXT: vpsrld $16, %xmm0, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xd0,0x10]
; KNL-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9]
; KNL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
@@ -1470,8 +1469,7 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; AVX512BW: ## %bb.0: ## %entry
; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512BW-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07]
-; AVX512BW-NEXT: vpshuflw $85, %xmm0, %xmm1 ## encoding: [0xc5,0xfb,0x70,0xc8,0x55]
-; AVX512BW-NEXT: ## xmm1 = xmm0[1,1,1,1,4,5,6,7]
+; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xd0,0x10]
; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9]
; AVX512BW-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX512BW-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
index 556b0deaf4c830..90975e912d8859 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
@@ -43,15 +43,15 @@ define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind {
;
; AVX512-LABEL: v_test_canonicalize__half:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: movzwl (%rdi), %eax
-; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX512-NEXT: retq
@@ -144,12 +144,12 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind {
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7]
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0
@@ -228,21 +228,21 @@ define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind
; AVX512-LABEL: v_test_canonicalize_v2half:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm2
-; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: vmulss %xmm1, %xmm3, %xmm3
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX512-NEXT: vmovd %xmm0, (%rdi)
; AVX512-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
index 0530c843acfe67..d87a5085103178 100644
--- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
@@ -1854,9 +1854,9 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
; AVX512-NEXT: cmovpl %ecx, %r8d
; AVX512-NEXT: movl $0, %r11d
; AVX512-NEXT: cmoval %ecx, %r11d
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7]
+; AVX512-NEXT: vpsrlq $48, %xmm1, %xmm2
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7]
+; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm3
; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT: vucomiss %xmm2, %xmm3
; AVX512-NEXT: movl $0, %r10d
@@ -1872,9 +1872,9 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
; AVX512-NEXT: cmovpl %ecx, %ebx
; AVX512-NEXT: movl $0, %r14d
; AVX512-NEXT: cmoval %ecx, %r14d
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
+; AVX512-NEXT: vpsrld $16, %xmm1, %xmm2
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,1,1,1,4,5,6,7]
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT: vucomiss %xmm2, %xmm3
; AVX512-NEXT: movl $0, %r15d
@@ -1916,7 +1916,7 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
; AVX512-NEXT: vpinsrw $7, %edx, %xmm3, %xmm3
; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; AVX512-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,1,1,1,4,5,6,7]
+; AVX512-NEXT: vpsrld $16, %xmm2, %xmm3
; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512-NEXT: vucomiss %xmm4, %xmm3
@@ -1937,7 +1937,7 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
; AVX512-NEXT: cmovnel %eax, %edx
; AVX512-NEXT: cmovpl %eax, %edx
; AVX512-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7]
+; AVX512-NEXT: vpsrlq $48, %xmm2, %xmm5
; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
; AVX512-NEXT: vucomiss %xmm4, %xmm5
; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index c7f5e13cb74647..7610579337811d 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -1813,14 +1813,14 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
; AVX512-LABEL: test_fmaximumnum_v4f16:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $72, %rsp
-; AVX512-NEXT: vmovdqa %xmm1, %xmm4
-; AVX512-NEXT: vmovdqa %xmm0, %xmm8
+; AVX512-NEXT: vmovdqa %xmm1, %xmm3
+; AVX512-NEXT: vmovdqa %xmm0, %xmm6
; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vucomiss %xmm0, %xmm0
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT: vucomiss %xmm1, %xmm1
; AVX512-NEXT: setp %al
@@ -1840,67 +1840,64 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm9
-; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm1
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,3,3,3]
+; AVX512-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT: vucomiss %xmm2, %xmm2
; AVX512-NEXT: setp %al
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,3,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: vucomiss %xmm4, %xmm4
+; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2}
+; AVX512-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k2}
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm5
+; AVX512-NEXT: vmovss %xmm5, %xmm2, %xmm2 {%k1}
; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm3
-; AVX512-NEXT: vucomiss %xmm3, %xmm2
+; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm4
+; AVX512-NEXT: vucomiss %xmm4, %xmm5
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm1
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1}
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT: vucomiss %xmm2, %xmm2
; AVX512-NEXT: setp %al
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm5 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: vucomiss %xmm5, %xmm5
+; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2}
+; AVX512-NEXT: vmovss %xmm2, %xmm5, %xmm5 {%k2}
+; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm5
+; AVX512-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm7
+; AVX512-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1}
; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm5
+; AVX512-NEXT: vucomiss %xmm5, %xmm7
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm4[1,0]
+; AVX512-NEXT: vmovss %xmm7, %xmm5, %xmm5 {%k1}
+; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT: vucomiss %xmm2, %xmm2
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm7 = xmm8[1,0]
+; AVX512-NEXT: vshufpd {{.*#+}} xmm7 = xmm6[1,0]
; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
; AVX512-NEXT: vucomiss %xmm7, %xmm7
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
; AVX512-NEXT: vmovss %xmm2, %xmm7, %xmm7 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm7, %xmm14
-; AVX512-NEXT: vcvtph2ps %xmm14, %xmm7
+; AVX512-NEXT: vcvtps2ph $4, %xmm7, %xmm13
+; AVX512-NEXT: vcvtph2ps %xmm13, %xmm7
; AVX512-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1}
; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1909,180 +1906,183 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vxorps %xmm15, %xmm15, %xmm15
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm5
-; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm0
+; AVX512-NEXT: vpxor %xmm14, %xmm14, %xmm14
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3,4,5,6,7]
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm15
+; AVX512-NEXT: vmulss %xmm0, %xmm15, %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm0
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm3
-; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX512-NEXT: vmulss %xmm0, %xmm15, %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4
+; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm0
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmulss %xmm0, %xmm15, %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm5
+; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm0
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3]
+; AVX512-NEXT: vmulss %xmm0, %xmm15, %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm2
; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7]
+; AVX512-NEXT: vpsrlq $48, %xmm3, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vucomiss %xmm0, %xmm0
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7]
+; AVX512-NEXT: vpsrlq $48, %xmm6, %xmm1
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT: vucomiss %xmm1, %xmm1
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm12
-; AVX512-NEXT: vcvtph2ps %xmm12, %xmm1
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm11
+; AVX512-NEXT: vcvtph2ps %xmm11, %xmm1
; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm13
-; AVX512-NEXT: vcvtph2ps %xmm13, %xmm6
-; AVX512-NEXT: vucomiss %xmm6...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/126194
More information about the llvm-commits mailing list