[llvm] [WIP][DAG] SimplifyDemandedVectorElts - add handling for INT<->FP conversions (PR #117884)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 2 01:16:59 PST 2024
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/117884
>From 479f61adc4dfaacda2f2f610b3cf30ca4656b8a7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 27 Nov 2024 14:09:08 +0000
Subject: [PATCH] [DAG] SimplifyDemandedVectorElts - add handling for INT<->FP
conversions
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 8 ++
llvm/test/CodeGen/PowerPC/pr38087.ll | 4 +-
.../CodeGen/X86/avx10_2fptosi_satcvtds.ll | 3 +-
.../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 107 +++++++++---------
llvm/test/CodeGen/X86/fpclamptosat_vec.ll | 12 +-
llvm/test/CodeGen/X86/freeze-vector.ll | 11 +-
llvm/test/CodeGen/X86/pr50609.ll | 3 +-
.../CodeGen/X86/vector-half-conversions.ll | 8 +-
llvm/test/CodeGen/X86/widen_conv-3.ll | 4 +-
9 files changed, 79 insertions(+), 81 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index bd4bcadb57d7a9..ed637401310f70 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3736,6 +3736,14 @@ bool TargetLowering::SimplifyDemandedVectorElts(
KnownUndef.clearAllBits();
}
break;
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
+ KnownZero, TLO, Depth + 1))
+ return true;
+ break;
default: {
if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts, KnownUndef,
diff --git a/llvm/test/CodeGen/PowerPC/pr38087.ll b/llvm/test/CodeGen/PowerPC/pr38087.ll
index 1216fa9cf8f260..933bf12cddaa62 100644
--- a/llvm/test/CodeGen/PowerPC/pr38087.ll
+++ b/llvm/test/CodeGen/PowerPC/pr38087.ll
@@ -11,9 +11,9 @@ declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #0
define void @draw_llvm_vs_variant0(<4 x float> %x) {
; CHECK-LABEL: draw_llvm_vs_variant0:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lxsd v3, 0(r3)
-; CHECK-NEXT: vmrghh v3, v3, v3
+; CHECK-NEXT: lxsihzx v3, 0, r3
; CHECK-NEXT: vextsh2w v3, v3
+; CHECK-NEXT: xxmrghw v3, v3, v3
; CHECK-NEXT: xvcvsxwsp vs0, v3
; CHECK-NEXT: xxspltw vs0, vs0, 2
; CHECK-NEXT: xvmaddasp vs0, v2, v2
diff --git a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
index 4a6556bdc4a919..494e4bc8e068e4 100644
--- a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
+++ b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
@@ -85,8 +85,7 @@ define i64 @test_signed_i64_f64(double %f) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT: vcvttpd2qq %xmm1, %xmm1
+; X86-NEXT: vcvttpd2qq %xmm0, %xmm1
; X86-NEXT: vmovd %xmm1, %esi
; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 332fbf7188af81..2163121410553f 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -141,56 +141,61 @@ declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>)
define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-LABEL: fmul_pow2_8xhalf:
; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: subq $88, %rsp
-; CHECK-SSE-NEXT: .cfi_def_cfa_offset 96
+; CHECK-SSE-NEXT: subq $104, %rsp
+; CHECK-SSE-NEXT: .cfi_def_cfa_offset 112
; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; CHECK-SSE-NEXT: pslld $23, %xmm1
; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
; CHECK-SSE-NEXT: paddd %xmm2, %xmm1
; CHECK-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; CHECK-SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-SSE-NEXT: pslld $16, %xmm1
+; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-SSE-NEXT: pslld $23, %xmm0
; CHECK-SSE-NEXT: paddd %xmm2, %xmm0
; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: pslld $16, %xmm0
-; CHECK-SSE-NEXT: psrld $16, %xmm0
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-SSE-NEXT: psrld $16, %xmm0
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: cvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-SSE-NEXT: psrlq $48, %xmm0
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3]
+; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = mem[3,3,3,3]
-; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
+; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-SSE-NEXT: psrld $16, %xmm0
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: cvtdq2ps (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-SSE-NEXT: psrlq $48, %xmm0
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3]
+; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
@@ -202,39 +207,39 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
-; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
+; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
@@ -246,14 +251,13 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; CHECK-SSE-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0]
-; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE-NEXT: addq $88, %rsp
+; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-SSE-NEXT: addq $104, %rsp
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
; CHECK-SSE-NEXT: retq
;
@@ -1028,17 +1032,17 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; CHECK-SSE-NEXT: pslld $23, %xmm0
; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,2,u,u,u,u,u,u]
-; CHECK-SSE-NEXT: pxor %xmm0, %xmm0
-; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0
+; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,2,u,u,u,u,u,u]
+; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-SSE-NEXT: psrld $16, %xmm0
+; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = mem[1,1,1,1]
-; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
+; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1049,8 +1053,9 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; CHECK-SSE-NEXT: callq __extendhfsf2 at PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2 at PLT
-; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE-NEXT: addq $40, %rsp
; CHECK-SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index 6aad4c2ebba1d8..2dedb10d42fb4d 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -731,7 +731,7 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) nounwind {
;
; AVX512-LABEL: stest_f16i32:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512-NEXT: vpmovsqd %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -894,7 +894,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind {
;
; AVX512-LABEL: utesth_f16i32:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX512-NEXT: vpmovusqd %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -1031,7 +1031,7 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) nounwind {
;
; AVX512-LABEL: ustest_f16i32:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
@@ -3343,7 +3343,7 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) nounwind {
;
; AVX512-LABEL: stest_f16i32_mm:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512-NEXT: vpmovsqd %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -3504,7 +3504,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind {
;
; AVX512-LABEL: utesth_f16i32_mm:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX512-NEXT: vpmovusqd %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -3640,7 +3640,7 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) nounwind {
;
; AVX512-LABEL: ustest_f16i32_mm:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index fe240286462e9a..362b3b945f9622 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -630,13 +630,8 @@ define void @pr59677(i32 %x, ptr %out) nounwind {
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: orl $1, %eax
-; X86-NEXT: vmovd %eax, %xmm1
-; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
@@ -651,10 +646,6 @@ define void @pr59677(i32 %x, ptr %out) nounwind {
; X64-NEXT: pushq %rbx
; X64-NEXT: movq %rsi, %rbx
; X64-NEXT: vmovd %edi, %xmm0
-; X64-NEXT: orl $1, %edi
-; X64-NEXT: vmovd %edi, %xmm1
-; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: callq sinf at PLT
diff --git a/llvm/test/CodeGen/X86/pr50609.ll b/llvm/test/CodeGen/X86/pr50609.ll
index 3e39c567d79e6d..c32363fcd5152c 100644
--- a/llvm/test/CodeGen/X86/pr50609.ll
+++ b/llvm/test/CodeGen/X86/pr50609.ll
@@ -8,7 +8,8 @@ define void @PR50609(ptr noalias nocapture %RET, ptr noalias %aFOO, <16 x i32> %
; CHECK-NEXT: vmovq %rsi, %xmm2
; CHECK-NEXT: vmovd %eax, %xmm3
; CHECK-NEXT: vpsubq %xmm2, %xmm3, %xmm2
-; CHECK-NEXT: vpsrad $31, %xmm2, %xmm3
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-NEXT: vpsrad $31, %xmm3, %xmm3
; CHECK-NEXT: vpsrld $30, %xmm3, %xmm3
; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; CHECK-NEXT: vpsrad $2, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 5148d1566c6294..62ee0b298ba917 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -4966,8 +4966,6 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
;
; F16C-LABEL: fptosi_2f16_to_4i32:
; F16C: # %bb.0:
-; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vcvttps2dq %xmm0, %xmm0
; F16C-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
@@ -4975,8 +4973,6 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
;
; AVX512-LABEL: fptosi_2f16_to_4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
@@ -5104,8 +5100,6 @@ define <4 x i32> @fptoui_2f16_to_4i32(<2 x half> %a) nounwind {
;
; AVX512-FASTLANE-LABEL: fptoui_2f16_to_4i32:
; AVX512-FASTLANE: # %bb.0:
-; AVX512-FASTLANE-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-FASTLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-FASTLANE-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-FASTLANE-NEXT: vcvttps2udq %xmm0, %xmm0
; AVX512-FASTLANE-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
@@ -5212,7 +5206,7 @@ define <4 x i32> @fptoui_4f16_to_4i32(<4 x half> %a) nounwind {
;
; AVX512F-LABEL: fptoui_4f16_to_4i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/widen_conv-3.ll b/llvm/test/CodeGen/X86/widen_conv-3.ll
index 5887834f9af2fb..f9b588b8b89158 100644
--- a/llvm/test/CodeGen/X86/widen_conv-3.ll
+++ b/llvm/test/CodeGen/X86/widen_conv-3.ll
@@ -10,7 +10,7 @@ define void @convert_v2i16_to_v2f32(ptr %dst.addr, <2 x i16> %src) nounwind {
; X86-SSE2-LABEL: convert_v2i16_to_v2f32:
; X86-SSE2: # %bb.0: # %entry
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SSE2-NEXT: psrad $16, %xmm0
; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; X86-SSE2-NEXT: movlps %xmm0, (%eax)
@@ -26,7 +26,7 @@ define void @convert_v2i16_to_v2f32(ptr %dst.addr, <2 x i16> %src) nounwind {
;
; X64-SSE2-LABEL: convert_v2i16_to_v2f32:
; X64-SSE2: # %bb.0: # %entry
-; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X64-SSE2-NEXT: psrad $16, %xmm0
; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
More information about the llvm-commits
mailing list