[llvm] 67c3f2b - [X86] mayFoldIntoStore - peek through oneuse bitcast users to find a store node (#123366)

via llvm-commits llvm-commits@lists.llvm.org
Sat Jan 18 00:38:29 PST 2025


Author: Simon Pilgrim
Date: 2025-01-18T14:08:24+05:30
New Revision: 67c3f2b4303972a6dc8ada54efe1d5d80d119a51

URL: https://github.com/llvm/llvm-project/commit/67c3f2b4303972a6dc8ada54efe1d5d80d119a51
DIFF: https://github.com/llvm/llvm-project/commit/67c3f2b4303972a6dc8ada54efe1d5d80d119a51.diff

LOG: [X86] mayFoldIntoStore - peek through oneuse bitcast users to find a store node (#123366)

mayFoldIntoStore currently just checks the direct (oneuse) user of an
SDValue to check whether it's stored, which prevents cases where we
bitcast the value prior to storing (usually the bitcast will be removed
later).

This patch peeks up through a chain of oneuse BITCAST nodes to see if
the value is eventually stored.

The main use of mayFoldIntoStore is v8i16 EXTRACT_VECTOR_ELT lowering
which will only use PEXTRW/PEXTRB for index0 extractions (vs the faster
MOVD) if the extracted value will be folded into a store on SSE41+
targets.

Fixes #107086

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
    llvm/test/CodeGen/X86/cvt16.ll
    llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
    llvm/test/CodeGen/X86/fp16-libcalls.ll
    llvm/test/CodeGen/X86/half-constrained.ll
    llvm/test/CodeGen/X86/half-darwin.ll
    llvm/test/CodeGen/X86/half.ll
    llvm/test/CodeGen/X86/pr91005.ll
    llvm/test/CodeGen/X86/pr95278.ll
    llvm/test/CodeGen/X86/vector-half-conversions.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 84736f18011a9d..8fe2781b148fcf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2812,7 +2812,16 @@ bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
 }
 
 bool X86::mayFoldIntoStore(SDValue Op) {
-  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->user_begin());
+  if (!Op.hasOneUse())
+    return false;
+  // Peek through (oneuse) bitcast users
+  SDNode *User = *Op->user_begin();
+  while (User->getOpcode() == ISD::BITCAST) {
+    if (!User->hasOneUse())
+      return false;
+    User = *User->user_begin();
+  }
+  return ISD::isNormalStore(User);
 }
 
 bool X86::mayFoldIntoZeroExtend(SDValue Op) {

diff  --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
index e911a24d830f75..04087c4f0dd5ed 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
@@ -53,8 +53,7 @@ define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind {
 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    movw %ax, (%rdi)
+; AVX512-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; AVX512-NEXT:    retq
 entry:
   %val = load half, half addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll
index efc457e35e7f37..c6c088297c0ea6 100644
--- a/llvm/test/CodeGen/X86/cvt16.ll
+++ b/llvm/test/CodeGen/X86/cvt16.ll
@@ -34,8 +34,7 @@ define void @test1(float %src, ptr %dest) nounwind {
 ; F16C-LABEL: test1:
 ; F16C:       # %bb.0:
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rdi)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
 ;
 ; SOFTFLOAT-LABEL: test1:

diff  --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
index bf93c8a1f5b511..e1b677e8550944 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
@@ -316,8 +316,7 @@ define void @fptrunc_float_to_f16(ptr %val, ptr%ret) nounwind strictfp {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    movw %ax, (%rsi)
+; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
 ; AVX-NEXT:    retq
 ;
 ; X86-LABEL: fptrunc_float_to_f16:
@@ -411,8 +410,7 @@ define void @fsqrt_f16(ptr %a) nounwind strictfp {
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    movw %ax, (%rdi)
+; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; X86-LABEL: fsqrt_f16:

diff  --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index 0d8290b120fa4f..3af8b1aec1feb2 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -12,8 +12,7 @@ define void @test_half_ceil(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rdi)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
 ;
 ; FP16-LABEL: test_half_ceil:
@@ -108,8 +107,7 @@ define void @test_half_cos(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    callq cosf at PLT
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rbx)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
 ; F16C-NEXT:    popq %rbx
 ; F16C-NEXT:    retq
 ;
@@ -167,8 +165,7 @@ define void @test_half_exp(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    callq expf at PLT
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rbx)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
 ; F16C-NEXT:    popq %rbx
 ; F16C-NEXT:    retq
 ;
@@ -226,8 +223,7 @@ define void @test_half_exp2(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    callq exp2f at PLT
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rbx)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
 ; F16C-NEXT:    popq %rbx
 ; F16C-NEXT:    retq
 ;
@@ -285,8 +281,7 @@ define void @test_half_exp10(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    callq exp10f at PLT
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rbx)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
 ; F16C-NEXT:    popq %rbx
 ; F16C-NEXT:    retq
 ;
@@ -342,8 +337,7 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rdi)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
 ;
 ; FP16-LABEL: test_half_fabs:
@@ -383,8 +377,7 @@ define void @test_half_floor(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rdi)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
 ;
 ; FP16-LABEL: test_half_floor:
@@ -438,8 +431,7 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; F16C-NEXT:    callq fmaf at PLT
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rbx)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
 ; F16C-NEXT:    popq %rbx
 ; F16C-NEXT:    retq
 ;
@@ -525,8 +517,7 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rdi)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
 ;
 ; FP16-LABEL: test_half_fneg:
@@ -568,8 +559,7 @@ define void @test_half_log(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    callq logf at PLT
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rbx)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
 ; F16C-NEXT:    popq %rbx
 ; F16C-NEXT:    retq
 ;
@@ -627,8 +617,7 @@ define void @test_half_log2(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    callq log2f at PLT
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rbx)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
 ; F16C-NEXT:    popq %rbx
 ; F16C-NEXT:    retq
 ;
@@ -686,8 +675,7 @@ define void @test_half_log10(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    callq log10f at PLT
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rbx)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
 ; F16C-NEXT:    popq %rbx
 ; F16C-NEXT:    retq
 ;
@@ -743,8 +731,7 @@ define void @test_half_nearbyint(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vroundss $12, %xmm0, %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rdi)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
 ;
 ; FP16-LABEL: test_half_nearbyint:
@@ -797,8 +784,7 @@ define void @test_half_pow(half %a0, half %a1, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; F16C-NEXT:    callq powf at PLT
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rbx)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
 ; F16C-NEXT:    popq %rbx
 ; F16C-NEXT:    retq
 ;
@@ -876,8 +862,7 @@ define void @test_half_powi(half %a0, i32 %a1, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    callq __powisf2 at PLT
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rbx)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
 ; F16C-NEXT:    popq %rbx
 ; F16C-NEXT:    retq
 ;
@@ -943,8 +928,7 @@ define void @test_half_rint(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vroundss $4, %xmm0, %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rdi)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
 ;
 ; FP16-LABEL: test_half_rint:
@@ -996,8 +980,7 @@ define void @test_half_sin(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    callq sinf at PLT
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rbx)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
 ; F16C-NEXT:    popq %rbx
 ; F16C-NEXT:    retq
 ;
@@ -1053,8 +1036,7 @@ define void @test_half_sqrt(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rdi)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
 ;
 ; FP16-LABEL: test_half_sqrt:
@@ -1107,8 +1089,7 @@ define void @test_half_tan(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    callq tanf at PLT
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rbx)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
 ; F16C-NEXT:    popq %rbx
 ; F16C-NEXT:    retq
 ;
@@ -1164,8 +1145,7 @@ define void @test_half_trunc(half %a0, ptr %p0) nounwind {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rdi)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
 ;
 ; FP16-LABEL: test_half_trunc:

diff  --git a/llvm/test/CodeGen/X86/half-constrained.ll b/llvm/test/CodeGen/X86/half-constrained.ll
index eae9b25e43e06f..0f73129d984bd9 100644
--- a/llvm/test/CodeGen/X86/half-constrained.ll
+++ b/llvm/test/CodeGen/X86/half-constrained.ll
@@ -176,8 +176,7 @@ define void @float_to_half(float %0) strictfp {
 ; X86-F16C:       # %bb.0:
 ; X86-F16C-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X86-F16C-NEXT:    vmovd %xmm0, %eax
-; X86-F16C-NEXT:    movw %ax, a
+; X86-F16C-NEXT:    vpextrw $0, %xmm0, a
 ; X86-F16C-NEXT:    retl
 ;
 ; X64-NOF16C-LABEL: float_to_half:
@@ -197,9 +196,8 @@ define void @float_to_half(float %0) strictfp {
 ; X64-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X64-F16C-NEXT:    vmovd %xmm0, %eax
-; X64-F16C-NEXT:    movq a at GOTPCREL(%rip), %rcx
-; X64-F16C-NEXT:    movw %ax, (%rcx)
+; X64-F16C-NEXT:    movq a at GOTPCREL(%rip), %rax
+; X64-F16C-NEXT:    vpextrw $0, %xmm0, (%rax)
 ; X64-F16C-NEXT:    retq
   %2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f32(float %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
   store half %2, ptr @a, align 2
@@ -354,8 +352,7 @@ define void @add() strictfp {
 ; X86-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X86-F16C-NEXT:    vmovd %xmm0, %eax
-; X86-F16C-NEXT:    movw %ax, c
+; X86-F16C-NEXT:    vpextrw $0, %xmm0, c
 ; X86-F16C-NEXT:    retl
 ;
 ; X64-NOF16C-LABEL: add:
@@ -392,9 +389,8 @@ define void @add() strictfp {
 ; X64-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X64-F16C-NEXT:    vmovd %xmm0, %eax
-; X64-F16C-NEXT:    movq c at GOTPCREL(%rip), %rcx
-; X64-F16C-NEXT:    movw %ax, (%rcx)
+; X64-F16C-NEXT:    movq c at GOTPCREL(%rip), %rax
+; X64-F16C-NEXT:    vpextrw $0, %xmm0, (%rax)
 ; X64-F16C-NEXT:    retq
   %1 = load half, ptr @a, align 2
   %2 = tail call float @llvm.experimental.constrained.fpext.f32.f16(half %1, metadata !"fpexcept.strict") #0

diff  --git a/llvm/test/CodeGen/X86/half-darwin.ll b/llvm/test/CodeGen/X86/half-darwin.ll
index 7388429143df56..3cbf5c11235ea8 100644
--- a/llvm/test/CodeGen/X86/half-darwin.ll
+++ b/llvm/test/CodeGen/X86/half-darwin.ll
@@ -16,8 +16,7 @@ define void @truncsfhf(float %in, ptr %ptr) nounwind {
 ; CHECK-F16C-LABEL: truncsfhf:
 ; CHECK-F16C:       ## %bb.0:
 ; CHECK-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-F16C-NEXT:    vmovd %xmm0, %eax
-; CHECK-F16C-NEXT:    movw %ax, (%rdi)
+; CHECK-F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; CHECK-F16C-NEXT:    retq
 ;
 ; CHECK-FP16-LABEL: truncsfhf:
@@ -108,8 +107,7 @@ define void @strict_truncsfhf(float %in, ptr %ptr) nounwind strictfp {
 ; CHECK-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-F16C-NEXT:    vmovd %xmm0, %eax
-; CHECK-F16C-NEXT:    movw %ax, (%rdi)
+; CHECK-F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; CHECK-F16C-NEXT:    retq
 ;
 ; CHECK-FP16-LABEL: strict_truncsfhf:

diff  --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 7bac075e486809..6e7f109a5da5c2 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -146,8 +146,7 @@ define void @test_trunc32(float %in, ptr %addr) #0 {
 ; BWON-F16C-LABEL: test_trunc32:
 ; BWON-F16C:       # %bb.0:
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vmovd %xmm0, %eax
-; BWON-F16C-NEXT:    movw %ax, (%rdi)
+; BWON-F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; BWON-F16C-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: test_trunc32:
@@ -265,8 +264,7 @@ define void @test_sitofp_i64(i64 %a, ptr %p) #0 {
 ; BWON-F16C:       # %bb.0:
 ; BWON-F16C-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vmovd %xmm0, %eax
-; BWON-F16C-NEXT:    movw %ax, (%rsi)
+; BWON-F16C-NEXT:    vpextrw $0, %xmm0, (%rsi)
 ; BWON-F16C-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: test_sitofp_i64:
@@ -398,8 +396,7 @@ define void @test_uitofp_i64(i64 %a, ptr %p) #0 {
 ; BWON-F16C-NEXT:    vaddss %xmm0, %xmm0, %xmm0
 ; BWON-F16C-NEXT:  .LBB10_3:
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vmovd %xmm0, %eax
-; BWON-F16C-NEXT:    movw %ax, (%rsi)
+; BWON-F16C-NEXT:    vpextrw $0, %xmm0, (%rsi)
 ; BWON-F16C-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: test_uitofp_i64:
@@ -1075,8 +1072,7 @@ define void @main.158() #0 {
 ; BWON-F16C-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; BWON-F16C-NEXT:  .LBB20_2: # %entry
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vmovd %xmm0, %eax
-; BWON-F16C-NEXT:    movw %ax, (%rax)
+; BWON-F16C-NEXT:    vpextrw $0, %xmm0, (%rax)
 ; BWON-F16C-NEXT:    retq
 ;
 ; CHECK-I686-LABEL: main.158:

diff  --git a/llvm/test/CodeGen/X86/pr91005.ll b/llvm/test/CodeGen/X86/pr91005.ll
index 97fd1ce4568826..d73cd7482c3904 100644
--- a/llvm/test/CodeGen/X86/pr91005.ll
+++ b/llvm/test/CodeGen/X86/pr91005.ll
@@ -16,8 +16,7 @@ define void @PR91005(ptr %0) minsize {
 ; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vmulss %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NEXT:    vmovd %xmm0, %eax
-; CHECK-NEXT:    movw %ax, (%rdi)
+; CHECK-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; CHECK-NEXT:  .LBB0_2: # %common.ret
 ; CHECK-NEXT:    retq
   %2 = bitcast <2 x half> poison to <2 x i16>

diff  --git a/llvm/test/CodeGen/X86/pr95278.ll b/llvm/test/CodeGen/X86/pr95278.ll
index 32783696f4692a..104fc04d68cdb2 100644
--- a/llvm/test/CodeGen/X86/pr95278.ll
+++ b/llvm/test/CodeGen/X86/pr95278.ll
@@ -8,8 +8,7 @@ define void @PR95278(ptr %p0, ptr %p1) {
 ; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
 ; CHECK-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NEXT:    vmovd %xmm0, %eax
-; CHECK-NEXT:    movw %ax, (%rsi)
+; CHECK-NEXT:    vpextrw $0, %xmm0, (%rsi)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %load = load <1024 x half>, ptr %p0, align 2

diff  --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index ec916060563a75..4e50b56323311f 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -2596,15 +2596,13 @@ define void @store_cvt_f32_to_i16(float %a0, ptr %a1) nounwind {
 ; F16C-LABEL: store_cvt_f32_to_i16:
 ; F16C:       # %bb.0:
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movw %ax, (%rdi)
+; F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; F16C-NEXT:    retq
 ;
 ; AVX512-LABEL: store_cvt_f32_to_i16:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    movw %ax, (%rdi)
+; AVX512-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; AVX512-NEXT:    retq
   %1 = fptrunc float %a0 to half
   %2 = bitcast half %1 to i16


        


More information about the llvm-commits mailing list