[llvm] 30e8dcd - Enable customer lowering for fabs_v16f16 with AVX2 (#72914)

via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 21 10:34:04 PST 2023


Author: David Li
Date: 2023-11-21T10:34:00-08:00
New Revision: 30e8dcd8ccbac748e9e45e8c94bad6e5645352e4

URL: https://github.com/llvm/llvm-project/commit/30e8dcd8ccbac748e9e45e8c94bad6e5645352e4
DIFF: https://github.com/llvm/llvm-project/commit/30e8dcd8ccbac748e9e45e8c94bad6e5645352e4.diff

LOG: Enable customer lowering for fabs_v16f16 with AVX2 (#72914)

This is part-2 change to improve codegen for vec_fabs. In this patch,
v16f16 and v132f16 fabs are improved.

There will be at least two followups patches after this one.
1) fixing the ISEL crash when fabs.v32f16 uses custom lowering with
AVX512
2) better expansion for v16f16, v32f16 types on AVX1 subtargets.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/vec_fabs.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7a4fa16edb7de49..011baa545dd82fe 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1596,6 +1596,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::STORE,              VT, Custom);
     }
     setF16Action(MVT::v16f16, Expand);
+    if (Subtarget.hasAVX2())
+      setOperationAction(ISD::FABS, MVT::v16f16, Custom);
     setOperationAction(ISD::FADD, MVT::v16f16, Expand);
     setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
     setOperationAction(ISD::FMUL, MVT::v16f16, Expand);

diff  --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll
index f691cb76bc684e6..ececfce210f563d 100644
--- a/llvm/test/CodeGen/X86/vec_fabs.ll
+++ b/llvm/test/CodeGen/X86/vec_fabs.ll
@@ -515,564 +515,17 @@ define <16 x half> @fabs_v16f16(ptr %p) {
 ;
 ; X86-AVX2-LABEL: fabs_v16f16:
 ; X86-AVX2:       # %bb.0:
-; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    subl $372, %esp # imm = 0x174
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 380
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-AVX2-NEXT:    vmovdqa (%esi), %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovaps 16(%esi), %xmm1
-; X86-AVX2-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; X86-AVX2-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 4(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 20(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 8(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 24(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 12(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 28(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vpand {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm1, (%esp)
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; X86-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vzeroupper
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X86-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X86-AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X86-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; X86-AVX2-NEXT:    addl $372, %esp # imm = 0x174
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X86-AVX2-NEXT:    vpand (%eax), %ymm0, %ymm0
 ; X86-AVX2-NEXT:    retl
 ;
-; X86-AVX512VL-LABEL: fabs_v16f16:
-; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512VL-NEXT:    movzwl 28(%eax), %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm0
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm1
-; X86-AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm1, %xmm1
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; X86-AVX512VL-NEXT:    vmovd %xmm1, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm3
-; X86-AVX512VL-NEXT:    vmovdqa (%eax), %xmm1
-; X86-AVX512VL-NEXT:    vmovdqa 16(%eax), %xmm2
-; X86-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm4, %ecx
-; X86-AVX512VL-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-AVX512VL-NEXT:    movzwl 12(%eax), %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm5, %ecx
-; X86-AVX512VL-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X86-AVX512VL-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; X86-AVX512VL-NEXT:    movzwl 24(%eax), %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm5, %ecx
-; X86-AVX512VL-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X86-AVX512VL-NEXT:    movzwl 8(%eax), %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm6, %ecx
-; X86-AVX512VL-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm6
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X86-AVX512VL-NEXT:    vmovd %xmm6, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm6
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X86-AVX512VL-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X86-AVX512VL-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
-; X86-AVX512VL-NEXT:    movzwl 20(%eax), %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VL-NEXT:    vpsrlq $48, %xmm2, %xmm5
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm5, %ecx
-; X86-AVX512VL-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X86-AVX512VL-NEXT:    movzwl 4(%eax), %eax
-; X86-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X86-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X86-AVX512VL-NEXT:    vpsrlq $48, %xmm1, %xmm6
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm6, %eax
-; X86-AVX512VL-NEXT:    movzwl %ax, %eax
-; X86-AVX512VL-NEXT:    vmovd %eax, %xmm6
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X86-AVX512VL-NEXT:    vmovd %xmm6, %eax
-; X86-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm6
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X86-AVX512VL-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm2, %eax
-; X86-AVX512VL-NEXT:    movzwl %ax, %eax
-; X86-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X86-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X86-AVX512VL-NEXT:    vpsrld $16, %xmm2, %xmm2
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm2, %eax
-; X86-AVX512VL-NEXT:    movzwl %ax, %eax
-; X86-AVX512VL-NEXT:    vmovd %eax, %xmm2
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm2, %xmm2
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; X86-AVX512VL-NEXT:    vmovd %xmm2, %eax
-; X86-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm1, %eax
-; X86-AVX512VL-NEXT:    movzwl %ax, %eax
-; X86-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X86-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X86-AVX512VL-NEXT:    vpsrld $16, %xmm1, %xmm1
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm1, %eax
-; X86-AVX512VL-NEXT:    movzwl %ax, %eax
-; X86-AVX512VL-NEXT:    vmovd %eax, %xmm1
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X86-AVX512VL-NEXT:    vmovd %xmm0, %eax
-; X86-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; X86-AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; X86-AVX512VL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5]
-; X86-AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; X86-AVX512VL-NEXT:    retl
-;
-; X86-AVX512FP16-LABEL: fabs_v16f16:
-; X86-AVX512FP16:       # %bb.0:
-; X86-AVX512FP16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512FP16-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
-; X86-AVX512FP16-NEXT:    vpand (%eax), %ymm0, %ymm0
-; X86-AVX512FP16-NEXT:    retl
-;
-; X86-AVX512VLDQ-LABEL: fabs_v16f16:
-; X86-AVX512VLDQ:       # %bb.0:
-; X86-AVX512VLDQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512VLDQ-NEXT:    movzwl 28(%eax), %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm0
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm0, %xmm1
-; X86-AVX512VLDQ-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm1, %xmm1
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm1, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm3
-; X86-AVX512VLDQ-NEXT:    vmovdqa (%eax), %xmm1
-; X86-AVX512VLDQ-NEXT:    vmovdqa 16(%eax), %xmm2
-; X86-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm4, %ecx
-; X86-AVX512VLDQ-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-AVX512VLDQ-NEXT:    movzwl 12(%eax), %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm5, %ecx
-; X86-AVX512VLDQ-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X86-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; X86-AVX512VLDQ-NEXT:    movzwl 24(%eax), %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm5, %ecx
-; X86-AVX512VLDQ-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X86-AVX512VLDQ-NEXT:    movzwl 8(%eax), %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm6, %ecx
-; X86-AVX512VLDQ-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm6
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm6, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm6
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X86-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X86-AVX512VLDQ-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
-; X86-AVX512VLDQ-NEXT:    movzwl 20(%eax), %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpsrlq $48, %xmm2, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm5, %ecx
-; X86-AVX512VLDQ-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X86-AVX512VLDQ-NEXT:    movzwl 4(%eax), %eax
-; X86-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpsrlq $48, %xmm1, %xmm6
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm6, %eax
-; X86-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X86-AVX512VLDQ-NEXT:    vmovd %eax, %xmm6
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm6, %eax
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm6
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X86-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm2, %eax
-; X86-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X86-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpsrld $16, %xmm2, %xmm2
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm2, %eax
-; X86-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X86-AVX512VLDQ-NEXT:    vmovd %eax, %xmm2
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm2, %xmm2
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm2, %xmm2
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm2, %eax
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm1, %eax
-; X86-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X86-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpsrld $16, %xmm1, %xmm1
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm1, %eax
-; X86-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X86-AVX512VLDQ-NEXT:    vmovd %eax, %xmm1
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm1, %xmm1
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm0, %eax
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; X86-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; X86-AVX512VLDQ-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5]
-; X86-AVX512VLDQ-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; X86-AVX512VLDQ-NEXT:    retl
+; X86-AVX512-LABEL: fabs_v16f16:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X86-AVX512-NEXT:    vpand (%eax), %ymm0, %ymm0
+; X86-AVX512-NEXT:    retl
 ;
 ; X64-AVX1-LABEL: fabs_v16f16:
 ; X64-AVX1:       # %bb.0:
@@ -1209,448 +662,15 @@ define <16 x half> @fabs_v16f16(ptr %p) {
 ;
 ; X64-AVX2-LABEL: fabs_v16f16:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rbx
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX2-NEXT:    subq $128, %rsp
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 144
-; X64-AVX2-NEXT:    .cfi_offset %rbx, -16
-; X64-AVX2-NEXT:    movq %rdi, %rbx
-; X64-AVX2-NEXT:    vpinsrw $0, 28(%rdi), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; X64-AVX2-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; X64-AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovaps (%rbx), %xmm0
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa 16(%rbx), %xmm0
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 24(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 8(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 20(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vandps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vandps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X64-AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; X64-AVX2-NEXT:    addq $128, %rsp
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX2-NEXT:    popq %rbx
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 8
+; X64-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X64-AVX2-NEXT:    vpand (%rdi), %ymm0, %ymm0
 ; X64-AVX2-NEXT:    retq
 ;
-; X64-AVX512VL-LABEL: fabs_v16f16:
-; X64-AVX512VL:       # %bb.0:
-; X64-AVX512VL-NEXT:    movzwl 28(%rdi), %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm0
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm1
-; X64-AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm1, %xmm1
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; X64-AVX512VL-NEXT:    vmovd %xmm1, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm3
-; X64-AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
-; X64-AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm2
-; X64-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm4, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X64-AVX512VL-NEXT:    movzwl 12(%rdi), %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm5, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X64-AVX512VL-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; X64-AVX512VL-NEXT:    movzwl 24(%rdi), %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm5, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X64-AVX512VL-NEXT:    movzwl 8(%rdi), %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm6, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm6
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X64-AVX512VL-NEXT:    vmovd %xmm6, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm6
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X64-AVX512VL-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X64-AVX512VL-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
-; X64-AVX512VL-NEXT:    movzwl 20(%rdi), %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VL-NEXT:    vpsrlq $48, %xmm2, %xmm5
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm5, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X64-AVX512VL-NEXT:    movzwl 4(%rdi), %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VL-NEXT:    vpsrlq $48, %xmm1, %xmm6
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm6, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm6
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X64-AVX512VL-NEXT:    vmovd %xmm6, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm6
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X64-AVX512VL-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm2, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VL-NEXT:    vpsrld $16, %xmm2, %xmm2
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm2, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm2
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm2, %xmm2
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; X64-AVX512VL-NEXT:    vmovd %xmm2, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm1, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VL-NEXT:    vpsrld $16, %xmm1, %xmm1
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm1, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm1
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X64-AVX512VL-NEXT:    vmovd %xmm0, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; X64-AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; X64-AVX512VL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5]
-; X64-AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; X64-AVX512VL-NEXT:    retq
-;
-; X64-AVX512FP16-LABEL: fabs_v16f16:
-; X64-AVX512FP16:       # %bb.0:
-; X64-AVX512FP16-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
-; X64-AVX512FP16-NEXT:    vpand (%rdi), %ymm0, %ymm0
-; X64-AVX512FP16-NEXT:    retq
-;
-; X64-AVX512VLDQ-LABEL: fabs_v16f16:
-; X64-AVX512VLDQ:       # %bb.0:
-; X64-AVX512VLDQ-NEXT:    movzwl 28(%rdi), %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm0
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm0, %xmm1
-; X64-AVX512VLDQ-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm1, %xmm1
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm1, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm3
-; X64-AVX512VLDQ-NEXT:    vmovdqa (%rdi), %xmm1
-; X64-AVX512VLDQ-NEXT:    vmovdqa 16(%rdi), %xmm2
-; X64-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm4, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X64-AVX512VLDQ-NEXT:    movzwl 12(%rdi), %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X64-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; X64-AVX512VLDQ-NEXT:    movzwl 24(%rdi), %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X64-AVX512VLDQ-NEXT:    movzwl 8(%rdi), %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm6, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm6
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm6, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm6
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X64-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X64-AVX512VLDQ-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
-; X64-AVX512VLDQ-NEXT:    movzwl 20(%rdi), %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpsrlq $48, %xmm2, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X64-AVX512VLDQ-NEXT:    movzwl 4(%rdi), %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpsrlq $48, %xmm1, %xmm6
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm6, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm6
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm6, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm6
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X64-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm2, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpsrld $16, %xmm2, %xmm2
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm2, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm2
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm2, %xmm2
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm2, %xmm2
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm2, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm1, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpsrld $16, %xmm1, %xmm1
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm1, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm1
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm1, %xmm1
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm0, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; X64-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; X64-AVX512VLDQ-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5]
-; X64-AVX512VLDQ-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; X64-AVX512VLDQ-NEXT:    retq
+; X64-AVX512-LABEL: fabs_v16f16:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X64-AVX512-NEXT:    vpand (%rdi), %ymm0, %ymm0
+; X64-AVX512-NEXT:    retq
   %v = load <16 x half>, ptr %p, align 32
   %nnv = call <16 x half> @llvm.fabs.v16f16(<16 x half> %v)
   ret <16 x half> %nnv
@@ -2215,481 +1235,10 @@ define <32 x half> @fabs_v32f16(ptr %p) {
 ;
 ; X86-AVX2-LABEL: fabs_v32f16:
 ; X86-AVX2:       # %bb.0:
-; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    subl $708, %esp # imm = 0x2C4
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 716
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-AVX2-NEXT:    vmovdqa 32(%esi), %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqa 48(%esi), %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; X86-AVX2-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 36(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 52(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 40(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 56(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 44(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 60(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqa (%esi), %xmm1
-; X86-AVX2-NEXT:    vmovdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovaps 16(%esi), %xmm0
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vpsrld $16, %xmm1, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 4(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 20(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 8(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 24(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 12(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 28(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vpand {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm1, (%esp)
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; X86-AVX2-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
-; X86-AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; X86-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; X86-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vzeroupper
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X86-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X86-AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; X86-AVX2-NEXT:    # ymm1 = ymm0[0],mem[0],ymm0[2],mem[2]
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 # 32-byte Reload
-; X86-AVX2-NEXT:    addl $708, %esp # imm = 0x2C4
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X86-AVX2-NEXT:    vpand (%eax), %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpand 32(%eax), %ymm1, %ymm1
 ; X86-AVX2-NEXT:    retl
 ;
 ; X86-AVX512VL-LABEL: fabs_v32f16:
@@ -3597,260 +2146,9 @@ define <32 x half> @fabs_v32f16(ptr %p) {
 ;
 ; X64-AVX2-LABEL: fabs_v32f16:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rbx
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX2-NEXT:    subq $192, %rsp
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 208
-; X64-AVX2-NEXT:    .cfi_offset %rbx, -16
-; X64-AVX2-NEXT:    movq %rdi, %rbx
-; X64-AVX2-NEXT:    vpinsrw $0, 28(%rdi), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; X64-AVX2-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; X64-AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovaps (%rbx), %xmm0
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa 16(%rbx), %xmm1
-; X64-AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovaps 32(%rbx), %xmm0
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovaps 48(%rbx), %xmm0
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 24(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 8(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 20(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vandps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vandps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X64-AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 60(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 44(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 56(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 40(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 52(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 36(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vandps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vandps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X64-AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm1 = ymm0[0],mem[0],ymm0[2],mem[2]
-; X64-AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; X64-AVX2-NEXT:    addq $192, %rsp
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX2-NEXT:    popq %rbx
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 8
+; X64-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X64-AVX2-NEXT:    vpand (%rdi), %ymm1, %ymm0
+; X64-AVX2-NEXT:    vpand 32(%rdi), %ymm1, %ymm1
 ; X64-AVX2-NEXT:    retq
 ;
 ; X64-AVX512VL-LABEL: fabs_v32f16:


        


More information about the llvm-commits mailing list