[libc-commits] [libc] [libcxx] [lld] [clang] [llvm] [compiler-rt] [flang] Fix ISel crash when lowering BUILD_VECTOR (PR #73186)

David Li via libc-commits libc-commits at lists.llvm.org
Fri Nov 24 09:29:59 PST 2023


https://github.com/david-xl updated https://github.com/llvm/llvm-project/pull/73186

>From c2fbdaa37ceb2f478464801fd1fc2dfbffc77cad Mon Sep 17 00:00:00 2001
From: David Li <davidxl at google.com>
Date: Mon, 20 Nov 2023 12:35:55 -0800
Subject: [PATCH 1/2] Enable custom lowering for fabs_v16f16 with AVX2

---
 llvm/lib/Target/X86/X86ISelLowering.cpp |    2 +
 llvm/test/CodeGen/X86/vec_fabs.ll       | 1748 +----------------------
 2 files changed, 25 insertions(+), 1725 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7a4fa16edb7de49..011baa545dd82fe 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1596,6 +1596,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::STORE,              VT, Custom);
     }
     setF16Action(MVT::v16f16, Expand);
+    if (Subtarget.hasAVX2())
+      setOperationAction(ISD::FABS, MVT::v16f16, Custom);
     setOperationAction(ISD::FADD, MVT::v16f16, Expand);
     setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
     setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll
index f691cb76bc684e6..ececfce210f563d 100644
--- a/llvm/test/CodeGen/X86/vec_fabs.ll
+++ b/llvm/test/CodeGen/X86/vec_fabs.ll
@@ -515,564 +515,17 @@ define <16 x half> @fabs_v16f16(ptr %p) {
 ;
 ; X86-AVX2-LABEL: fabs_v16f16:
 ; X86-AVX2:       # %bb.0:
-; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    subl $372, %esp # imm = 0x174
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 380
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-AVX2-NEXT:    vmovdqa (%esi), %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovaps 16(%esi), %xmm1
-; X86-AVX2-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; X86-AVX2-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 4(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 20(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 8(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 24(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 12(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 28(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vpand {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm1, (%esp)
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; X86-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vzeroupper
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X86-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X86-AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X86-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; X86-AVX2-NEXT:    addl $372, %esp # imm = 0x174
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X86-AVX2-NEXT:    vpand (%eax), %ymm0, %ymm0
 ; X86-AVX2-NEXT:    retl
 ;
-; X86-AVX512VL-LABEL: fabs_v16f16:
-; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512VL-NEXT:    movzwl 28(%eax), %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm0
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm1
-; X86-AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm1, %xmm1
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; X86-AVX512VL-NEXT:    vmovd %xmm1, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm3
-; X86-AVX512VL-NEXT:    vmovdqa (%eax), %xmm1
-; X86-AVX512VL-NEXT:    vmovdqa 16(%eax), %xmm2
-; X86-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm4, %ecx
-; X86-AVX512VL-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-AVX512VL-NEXT:    movzwl 12(%eax), %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm5, %ecx
-; X86-AVX512VL-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X86-AVX512VL-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; X86-AVX512VL-NEXT:    movzwl 24(%eax), %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm5, %ecx
-; X86-AVX512VL-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X86-AVX512VL-NEXT:    movzwl 8(%eax), %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm6, %ecx
-; X86-AVX512VL-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm6
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X86-AVX512VL-NEXT:    vmovd %xmm6, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm6
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X86-AVX512VL-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X86-AVX512VL-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
-; X86-AVX512VL-NEXT:    movzwl 20(%eax), %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VL-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VL-NEXT:    vpsrlq $48, %xmm2, %xmm5
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm5, %ecx
-; X86-AVX512VL-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VL-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VL-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X86-AVX512VL-NEXT:    movzwl 4(%eax), %eax
-; X86-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X86-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X86-AVX512VL-NEXT:    vpsrlq $48, %xmm1, %xmm6
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm6, %eax
-; X86-AVX512VL-NEXT:    movzwl %ax, %eax
-; X86-AVX512VL-NEXT:    vmovd %eax, %xmm6
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X86-AVX512VL-NEXT:    vmovd %xmm6, %eax
-; X86-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm6
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X86-AVX512VL-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm2, %eax
-; X86-AVX512VL-NEXT:    movzwl %ax, %eax
-; X86-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X86-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X86-AVX512VL-NEXT:    vpsrld $16, %xmm2, %xmm2
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm2, %eax
-; X86-AVX512VL-NEXT:    movzwl %ax, %eax
-; X86-AVX512VL-NEXT:    vmovd %eax, %xmm2
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm2, %xmm2
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; X86-AVX512VL-NEXT:    vmovd %xmm2, %eax
-; X86-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm1, %eax
-; X86-AVX512VL-NEXT:    movzwl %ax, %eax
-; X86-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X86-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X86-AVX512VL-NEXT:    vpsrld $16, %xmm1, %xmm1
-; X86-AVX512VL-NEXT:    vpextrw $0, %xmm1, %eax
-; X86-AVX512VL-NEXT:    movzwl %ax, %eax
-; X86-AVX512VL-NEXT:    vmovd %eax, %xmm1
-; X86-AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
-; X86-AVX512VL-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; X86-AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X86-AVX512VL-NEXT:    vmovd %xmm0, %eax
-; X86-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; X86-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; X86-AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; X86-AVX512VL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5]
-; X86-AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; X86-AVX512VL-NEXT:    retl
-;
-; X86-AVX512FP16-LABEL: fabs_v16f16:
-; X86-AVX512FP16:       # %bb.0:
-; X86-AVX512FP16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512FP16-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
-; X86-AVX512FP16-NEXT:    vpand (%eax), %ymm0, %ymm0
-; X86-AVX512FP16-NEXT:    retl
-;
-; X86-AVX512VLDQ-LABEL: fabs_v16f16:
-; X86-AVX512VLDQ:       # %bb.0:
-; X86-AVX512VLDQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512VLDQ-NEXT:    movzwl 28(%eax), %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm0
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm0, %xmm1
-; X86-AVX512VLDQ-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm1, %xmm1
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm1, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm3
-; X86-AVX512VLDQ-NEXT:    vmovdqa (%eax), %xmm1
-; X86-AVX512VLDQ-NEXT:    vmovdqa 16(%eax), %xmm2
-; X86-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm4, %ecx
-; X86-AVX512VLDQ-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-AVX512VLDQ-NEXT:    movzwl 12(%eax), %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm5, %ecx
-; X86-AVX512VLDQ-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X86-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; X86-AVX512VLDQ-NEXT:    movzwl 24(%eax), %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm5, %ecx
-; X86-AVX512VLDQ-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X86-AVX512VLDQ-NEXT:    movzwl 8(%eax), %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm6, %ecx
-; X86-AVX512VLDQ-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm6
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm6, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm6
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X86-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X86-AVX512VLDQ-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
-; X86-AVX512VLDQ-NEXT:    movzwl 20(%eax), %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm4, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
-; X86-AVX512VLDQ-NEXT:    vpsrlq $48, %xmm2, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm5, %ecx
-; X86-AVX512VLDQ-NEXT:    movzwl %cx, %ecx
-; X86-AVX512VLDQ-NEXT:    vmovd %ecx, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm5, %ecx
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X86-AVX512VLDQ-NEXT:    movzwl 4(%eax), %eax
-; X86-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpsrlq $48, %xmm1, %xmm6
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm6, %eax
-; X86-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X86-AVX512VLDQ-NEXT:    vmovd %eax, %xmm6
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm6, %eax
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm6
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X86-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm2, %eax
-; X86-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X86-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpsrld $16, %xmm2, %xmm2
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm2, %eax
-; X86-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X86-AVX512VLDQ-NEXT:    vmovd %eax, %xmm2
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm2, %xmm2
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm2, %xmm2
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm2, %eax
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm1, %eax
-; X86-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X86-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X86-AVX512VLDQ-NEXT:    vpsrld $16, %xmm1, %xmm1
-; X86-AVX512VLDQ-NEXT:    vpextrw $0, %xmm1, %eax
-; X86-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X86-AVX512VLDQ-NEXT:    vmovd %eax, %xmm1
-; X86-AVX512VLDQ-NEXT:    vcvtph2ps %xmm1, %xmm1
-; X86-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; X86-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X86-AVX512VLDQ-NEXT:    vmovd %xmm0, %eax
-; X86-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; X86-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; X86-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; X86-AVX512VLDQ-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5]
-; X86-AVX512VLDQ-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; X86-AVX512VLDQ-NEXT:    retl
+; X86-AVX512-LABEL: fabs_v16f16:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X86-AVX512-NEXT:    vpand (%eax), %ymm0, %ymm0
+; X86-AVX512-NEXT:    retl
 ;
 ; X64-AVX1-LABEL: fabs_v16f16:
 ; X64-AVX1:       # %bb.0:
@@ -1209,448 +662,15 @@ define <16 x half> @fabs_v16f16(ptr %p) {
 ;
 ; X64-AVX2-LABEL: fabs_v16f16:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rbx
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX2-NEXT:    subq $128, %rsp
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 144
-; X64-AVX2-NEXT:    .cfi_offset %rbx, -16
-; X64-AVX2-NEXT:    movq %rdi, %rbx
-; X64-AVX2-NEXT:    vpinsrw $0, 28(%rdi), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; X64-AVX2-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; X64-AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovaps (%rbx), %xmm0
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa 16(%rbx), %xmm0
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 24(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 8(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 20(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vandps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vandps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2 at PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2 at PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X64-AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; X64-AVX2-NEXT:    addq $128, %rsp
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX2-NEXT:    popq %rbx
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 8
+; X64-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X64-AVX2-NEXT:    vpand (%rdi), %ymm0, %ymm0
 ; X64-AVX2-NEXT:    retq
 ;
-; X64-AVX512VL-LABEL: fabs_v16f16:
-; X64-AVX512VL:       # %bb.0:
-; X64-AVX512VL-NEXT:    movzwl 28(%rdi), %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm0
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm0, %xmm1
-; X64-AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm1, %xmm1
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; X64-AVX512VL-NEXT:    vmovd %xmm1, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm3
-; X64-AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
-; X64-AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm2
-; X64-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm4, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X64-AVX512VL-NEXT:    movzwl 12(%rdi), %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm5, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X64-AVX512VL-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; X64-AVX512VL-NEXT:    movzwl 24(%rdi), %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm5, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X64-AVX512VL-NEXT:    movzwl 8(%rdi), %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VL-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm6, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm6
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X64-AVX512VL-NEXT:    vmovd %xmm6, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm6
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X64-AVX512VL-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X64-AVX512VL-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
-; X64-AVX512VL-NEXT:    movzwl 20(%rdi), %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VL-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VL-NEXT:    vpsrlq $48, %xmm2, %xmm5
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm5, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X64-AVX512VL-NEXT:    movzwl 4(%rdi), %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VL-NEXT:    vpsrlq $48, %xmm1, %xmm6
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm6, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm6
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X64-AVX512VL-NEXT:    vmovd %xmm6, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm6
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X64-AVX512VL-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm2, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VL-NEXT:    vpsrld $16, %xmm2, %xmm2
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm2, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm2
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm2, %xmm2
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm2, %xmm2
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; X64-AVX512VL-NEXT:    vmovd %xmm2, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm1, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VL-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VL-NEXT:    vpsrld $16, %xmm1, %xmm1
-; X64-AVX512VL-NEXT:    vpextrw $0, %xmm1, %eax
-; X64-AVX512VL-NEXT:    movzwl %ax, %eax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm1
-; X64-AVX512VL-NEXT:    vcvtph2ps %xmm1, %xmm1
-; X64-AVX512VL-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; X64-AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X64-AVX512VL-NEXT:    vmovd %xmm0, %eax
-; X64-AVX512VL-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; X64-AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; X64-AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; X64-AVX512VL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5]
-; X64-AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; X64-AVX512VL-NEXT:    retq
-;
-; X64-AVX512FP16-LABEL: fabs_v16f16:
-; X64-AVX512FP16:       # %bb.0:
-; X64-AVX512FP16-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
-; X64-AVX512FP16-NEXT:    vpand (%rdi), %ymm0, %ymm0
-; X64-AVX512FP16-NEXT:    retq
-;
-; X64-AVX512VLDQ-LABEL: fabs_v16f16:
-; X64-AVX512VLDQ:       # %bb.0:
-; X64-AVX512VLDQ-NEXT:    movzwl 28(%rdi), %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm0
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm0, %xmm1
-; X64-AVX512VLDQ-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm1, %xmm1
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm1, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm3
-; X64-AVX512VLDQ-NEXT:    vmovdqa (%rdi), %xmm1
-; X64-AVX512VLDQ-NEXT:    vmovdqa 16(%rdi), %xmm2
-; X64-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm4, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X64-AVX512VLDQ-NEXT:    movzwl 12(%rdi), %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X64-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; X64-AVX512VLDQ-NEXT:    movzwl 24(%rdi), %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X64-AVX512VLDQ-NEXT:    movzwl 8(%rdi), %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm6, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm6
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm6, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm6
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X64-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X64-AVX512VLDQ-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
-; X64-AVX512VLDQ-NEXT:    movzwl 20(%rdi), %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm4, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
-; X64-AVX512VLDQ-NEXT:    vpsrlq $48, %xmm2, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X64-AVX512VLDQ-NEXT:    movzwl 4(%rdi), %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpsrlq $48, %xmm1, %xmm6
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm6, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm6
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm6, %xmm6
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm6, %xmm6
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm6, %xmm6
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm6, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm6
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; X64-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm2, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpsrld $16, %xmm2, %xmm2
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm2, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm2
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm2, %xmm2
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm2, %xmm2
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm2, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm1, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm5, %xmm5
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm5, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
-; X64-AVX512VLDQ-NEXT:    vpsrld $16, %xmm1, %xmm1
-; X64-AVX512VLDQ-NEXT:    vpextrw $0, %xmm1, %eax
-; X64-AVX512VLDQ-NEXT:    movzwl %ax, %eax
-; X64-AVX512VLDQ-NEXT:    vmovd %eax, %xmm1
-; X64-AVX512VLDQ-NEXT:    vcvtph2ps %xmm1, %xmm1
-; X64-AVX512VLDQ-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; X64-AVX512VLDQ-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; X64-AVX512VLDQ-NEXT:    vmovd %xmm0, %eax
-; X64-AVX512VLDQ-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; X64-AVX512VLDQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; X64-AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; X64-AVX512VLDQ-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5]
-; X64-AVX512VLDQ-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; X64-AVX512VLDQ-NEXT:    retq
+; X64-AVX512-LABEL: fabs_v16f16:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X64-AVX512-NEXT:    vpand (%rdi), %ymm0, %ymm0
+; X64-AVX512-NEXT:    retq
   %v = load <16 x half>, ptr %p, align 32
   %nnv = call <16 x half> @llvm.fabs.v16f16(<16 x half> %v)
   ret <16 x half> %nnv
@@ -2215,481 +1235,10 @@ define <32 x half> @fabs_v32f16(ptr %p) {
 ;
 ; X86-AVX2-LABEL: fabs_v32f16:
 ; X86-AVX2:       # %bb.0:
-; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    subl $708, %esp # imm = 0x2C4
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 716
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-AVX2-NEXT:    vmovdqa 32(%esi), %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqa 48(%esi), %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; X86-AVX2-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 36(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 52(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 40(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 56(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 44(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 60(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqa (%esi), %xmm1
-; X86-AVX2-NEXT:    vmovdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovaps 16(%esi), %xmm0
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vpsrld $16, %xmm1, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 4(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 20(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 8(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 24(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 12(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovd %xmm0, (%esp)
-; X86-AVX2-NEXT:    vpinsrw $0, 28(%esi), %xmm0, %xmm0
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpextrw $0, %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vandps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __extendhfsf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX2-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX2-NEXT:    vpand {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vmovss %xmm1, (%esp)
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; X86-AVX2-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
-; X86-AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; X86-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; X86-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; X86-AVX2-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-AVX2-NEXT:    vzeroupper
-; X86-AVX2-NEXT:    calll __truncsfhf2
-; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X86-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X86-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X86-AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; X86-AVX2-NEXT:    # ymm1 = ymm0[0],mem[0],ymm0[2],mem[2]
-; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 # 32-byte Reload
-; X86-AVX2-NEXT:    addl $708, %esp # imm = 0x2C4
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X86-AVX2-NEXT:    vpand (%eax), %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpand 32(%eax), %ymm1, %ymm1
 ; X86-AVX2-NEXT:    retl
 ;
 ; X86-AVX512VL-LABEL: fabs_v32f16:
@@ -3597,260 +2146,9 @@ define <32 x half> @fabs_v32f16(ptr %p) {
 ;
 ; X64-AVX2-LABEL: fabs_v32f16:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rbx
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX2-NEXT:    subq $192, %rsp
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 208
-; X64-AVX2-NEXT:    .cfi_offset %rbx, -16
-; X64-AVX2-NEXT:    movq %rdi, %rbx
-; X64-AVX2-NEXT:    vpinsrw $0, 28(%rdi), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; X64-AVX2-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; X64-AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovaps (%rbx), %xmm0
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa 16(%rbx), %xmm1
-; X64-AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovaps 32(%rbx), %xmm0
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovaps 48(%rbx), %xmm0
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 24(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 8(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 20(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vandps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vandps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X64-AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 60(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 44(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 56(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 40(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 52(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vpinsrw $0, 36(%rbx), %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vandps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vandps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X64-AVX2-NEXT:    callq __extendhfsf2@PLT
-; X64-AVX2-NEXT:    vpand (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    callq __truncsfhf2@PLT
-; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-AVX2-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; X64-AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; X64-AVX2-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; X64-AVX2-NEXT:    # ymm1 = ymm0[0],mem[0],ymm0[2],mem[2]
-; X64-AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; X64-AVX2-NEXT:    addq $192, %rsp
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX2-NEXT:    popq %rbx
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 8
+; X64-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X64-AVX2-NEXT:    vpand (%rdi), %ymm1, %ymm0
+; X64-AVX2-NEXT:    vpand 32(%rdi), %ymm1, %ymm1
 ; X64-AVX2-NEXT:    retq
 ;
 ; X64-AVX512VL-LABEL: fabs_v32f16:

>From 9cf05a0f584e4e227234a7db276f905fd3373547 Mon Sep 17 00:00:00 2001
From: David Li <davidxl at google.com>
Date: Wed, 22 Nov 2023 15:40:33 -0800
Subject: [PATCH 2/2] Fix crash in instruction selection

---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp |  16 +-
 llvm/test/CodeGen/X86/shuffle-half.ll   | 339 ++++++++++++++++++++++++
 2 files changed, 349 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 93e184eca9bc515..5d0230dbe0a823d 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -881,6 +881,10 @@ static bool isEndbrImm64(uint64_t Imm) {
   return false;
 }
 
+static bool needBWI(MVT VT) {
+  return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
+}
+
 void X86DAGToDAGISel::PreprocessISelDAG() {
   bool MadeChange = false;
   for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
@@ -986,15 +990,15 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
     case X86ISD::VBROADCAST: {
       MVT VT = N->getSimpleValueType(0);
       // Emulate v32i16/v64i8 broadcast without BWI.
-      if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
-        MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
+      if (!Subtarget->hasBWI() && needBWI(VT)) {
+        MVT NarrowVT = VT.getHalfNumVectorElementsVT();
         SDLoc dl(N);
         SDValue NarrowBCast =
             CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
         SDValue Res =
             CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
                             NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
-        unsigned Index = VT == MVT::v32i16 ? 16 : 32;
+        unsigned Index = NarrowVT.getVectorMinNumElements();
         Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
                               CurDAG->getIntPtrConstant(Index, dl));
 
@@ -1010,8 +1014,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
     case X86ISD::VBROADCAST_LOAD: {
       MVT VT = N->getSimpleValueType(0);
       // Emulate v32i16/v64i8 broadcast without BWI.
-      if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
-        MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
+      if (!Subtarget->hasBWI() && needBWI(VT)) {
+        MVT NarrowVT = VT.getHalfNumVectorElementsVT();
         auto *MemNode = cast<MemSDNode>(N);
         SDLoc dl(N);
         SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
@@ -1022,7 +1026,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
         SDValue Res =
             CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
                             NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
-        unsigned Index = VT == MVT::v32i16 ? 16 : 32;
+        unsigned Index = NarrowVT.getVectorMinNumElements();
         Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
                               CurDAG->getIntPtrConstant(Index, dl));
 
diff --git a/llvm/test/CodeGen/X86/shuffle-half.ll b/llvm/test/CodeGen/X86/shuffle-half.ll
index 0529ca1a0b82c1d..64f3264d6d6bb14 100644
--- a/llvm/test/CodeGen/X86/shuffle-half.ll
+++ b/llvm/test/CodeGen/X86/shuffle-half.ll
@@ -308,4 +308,343 @@ define <32 x half> @dump_vec() {
   ret <32 x half> %1
 }
 
+define <32 x half> @build_vec(ptr %p, <32 x i1> %mask) {
+; CHECK-LABEL: build_vec:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsllw $7, %ymm0, %ymm0
+; CHECK-NEXT:    vpmovmskb %ymm0, %eax
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB1_1
+; CHECK-NEXT:  # %bb.2: # %cond.load
+; CHECK-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7]
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT:    vinserti32x4 $0, %xmm0, %zmm1, %zmm0
+; CHECK-NEXT:    testb $2, %al
+; CHECK-NEXT:    jne .LBB1_4
+; CHECK-NEXT:    jmp .LBB1_5
+; CHECK-NEXT:  .LBB1_1:
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; CHECK-NEXT:    testb $2, %al
+; CHECK-NEXT:    je .LBB1_5
+; CHECK-NEXT:  .LBB1_4: # %cond.load1
+; CHECK-NEXT:    vpbroadcastw 2(%rdi), %xmm1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT:  .LBB1_5: # %else2
+; CHECK-NEXT:    testb $4, %al
+; CHECK-NEXT:    jne .LBB1_6
+; CHECK-NEXT:  # %bb.7: # %else5
+; CHECK-NEXT:    testb $8, %al
+; CHECK-NEXT:    jne .LBB1_8
+; CHECK-NEXT:  .LBB1_9: # %else8
+; CHECK-NEXT:    testb $16, %al
+; CHECK-NEXT:    jne .LBB1_10
+; CHECK-NEXT:  .LBB1_11: # %else11
+; CHECK-NEXT:    testb $32, %al
+; CHECK-NEXT:    jne .LBB1_12
+; CHECK-NEXT:  .LBB1_13: # %else14
+; CHECK-NEXT:    testb $64, %al
+; CHECK-NEXT:    jne .LBB1_14
+; CHECK-NEXT:  .LBB1_15: # %else17
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    js .LBB1_16
+; CHECK-NEXT:  .LBB1_17: # %else20
+; CHECK-NEXT:    testl $256, %eax # imm = 0x100
+; CHECK-NEXT:    jne .LBB1_18
+; CHECK-NEXT:  .LBB1_19: # %else23
+; CHECK-NEXT:    testl $512, %eax # imm = 0x200
+; CHECK-NEXT:    jne .LBB1_20
+; CHECK-NEXT:  .LBB1_21: # %else26
+; CHECK-NEXT:    testl $1024, %eax # imm = 0x400
+; CHECK-NEXT:    jne .LBB1_22
+; CHECK-NEXT:  .LBB1_23: # %else29
+; CHECK-NEXT:    testl $2048, %eax # imm = 0x800
+; CHECK-NEXT:    jne .LBB1_24
+; CHECK-NEXT:  .LBB1_25: # %else32
+; CHECK-NEXT:    testl $4096, %eax # imm = 0x1000
+; CHECK-NEXT:    jne .LBB1_26
+; CHECK-NEXT:  .LBB1_27: # %else35
+; CHECK-NEXT:    testl $8192, %eax # imm = 0x2000
+; CHECK-NEXT:    jne .LBB1_28
+; CHECK-NEXT:  .LBB1_29: # %else38
+; CHECK-NEXT:    testl $16384, %eax # imm = 0x4000
+; CHECK-NEXT:    jne .LBB1_30
+; CHECK-NEXT:  .LBB1_31: # %else41
+; CHECK-NEXT:    testw %ax, %ax
+; CHECK-NEXT:    js .LBB1_32
+; CHECK-NEXT:  .LBB1_33: # %else44
+; CHECK-NEXT:    testl $65536, %eax # imm = 0x10000
+; CHECK-NEXT:    jne .LBB1_34
+; CHECK-NEXT:  .LBB1_35: # %else47
+; CHECK-NEXT:    testl $131072, %eax # imm = 0x20000
+; CHECK-NEXT:    jne .LBB1_36
+; CHECK-NEXT:  .LBB1_37: # %else50
+; CHECK-NEXT:    testl $262144, %eax # imm = 0x40000
+; CHECK-NEXT:    jne .LBB1_38
+; CHECK-NEXT:  .LBB1_39: # %else53
+; CHECK-NEXT:    testl $524288, %eax # imm = 0x80000
+; CHECK-NEXT:    jne .LBB1_40
+; CHECK-NEXT:  .LBB1_41: # %else56
+; CHECK-NEXT:    testl $1048576, %eax # imm = 0x100000
+; CHECK-NEXT:    jne .LBB1_42
+; CHECK-NEXT:  .LBB1_43: # %else59
+; CHECK-NEXT:    testl $2097152, %eax # imm = 0x200000
+; CHECK-NEXT:    jne .LBB1_44
+; CHECK-NEXT:  .LBB1_45: # %else62
+; CHECK-NEXT:    testl $4194304, %eax # imm = 0x400000
+; CHECK-NEXT:    jne .LBB1_46
+; CHECK-NEXT:  .LBB1_47: # %else65
+; CHECK-NEXT:    testl $8388608, %eax # imm = 0x800000
+; CHECK-NEXT:    jne .LBB1_48
+; CHECK-NEXT:  .LBB1_49: # %else68
+; CHECK-NEXT:    testl $16777216, %eax # imm = 0x1000000
+; CHECK-NEXT:    jne .LBB1_50
+; CHECK-NEXT:  .LBB1_51: # %else71
+; CHECK-NEXT:    testl $33554432, %eax # imm = 0x2000000
+; CHECK-NEXT:    jne .LBB1_52
+; CHECK-NEXT:  .LBB1_53: # %else74
+; CHECK-NEXT:    testl $67108864, %eax # imm = 0x4000000
+; CHECK-NEXT:    jne .LBB1_54
+; CHECK-NEXT:  .LBB1_55: # %else77
+; CHECK-NEXT:    testl $134217728, %eax # imm = 0x8000000
+; CHECK-NEXT:    jne .LBB1_56
+; CHECK-NEXT:  .LBB1_57: # %else80
+; CHECK-NEXT:    testl $268435456, %eax # imm = 0x10000000
+; CHECK-NEXT:    jne .LBB1_58
+; CHECK-NEXT:  .LBB1_59: # %else83
+; CHECK-NEXT:    testl $536870912, %eax # imm = 0x20000000
+; CHECK-NEXT:    jne .LBB1_60
+; CHECK-NEXT:  .LBB1_61: # %else86
+; CHECK-NEXT:    testl $1073741824, %eax # imm = 0x40000000
+; CHECK-NEXT:    jne .LBB1_62
+; CHECK-NEXT:  .LBB1_63: # %else89
+; CHECK-NEXT:    testl $-2147483648, %eax # imm = 0x80000000
+; CHECK-NEXT:    jne .LBB1_64
+; CHECK-NEXT:  .LBB1_65: # %else92
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB1_6: # %cond.load4
+; CHECK-NEXT:    vpbroadcastw 4(%rdi), %xmm1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT:    testb $8, %al
+; CHECK-NEXT:    je .LBB1_9
+; CHECK-NEXT:  .LBB1_8: # %cond.load7
+; CHECK-NEXT:    vpbroadcastw 6(%rdi), %xmm1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT:    testb $16, %al
+; CHECK-NEXT:    je .LBB1_11
+; CHECK-NEXT:  .LBB1_10: # %cond.load10
+; CHECK-NEXT:    vpbroadcastw 8(%rdi), %xmm1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
+; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT:    testb $32, %al
+; CHECK-NEXT:    je .LBB1_13
+; CHECK-NEXT:  .LBB1_12: # %cond.load13
+; CHECK-NEXT:    vpbroadcastw 10(%rdi), %xmm1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
+; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT:    testb $64, %al
+; CHECK-NEXT:    je .LBB1_15
+; CHECK-NEXT:  .LBB1_14: # %cond.load16
+; CHECK-NEXT:    vpbroadcastw 12(%rdi), %xmm1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
+; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jns .LBB1_17
+; CHECK-NEXT:  .LBB1_16: # %cond.load19
+; CHECK-NEXT:    vpbroadcastw 14(%rdi), %xmm1
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $256, %eax # imm = 0x100
+; CHECK-NEXT:    je .LBB1_19
+; CHECK-NEXT:  .LBB1_18: # %cond.load22
+; CHECK-NEXT:    vpbroadcastw 16(%rdi), %ymm1
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; CHECK-NEXT:    testl $512, %eax # imm = 0x200
+; CHECK-NEXT:    je .LBB1_21
+; CHECK-NEXT:  .LBB1_20: # %cond.load25
+; CHECK-NEXT:    vpbroadcastw 18(%rdi), %ymm1
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; CHECK-NEXT:    testl $1024, %eax # imm = 0x400
+; CHECK-NEXT:    je .LBB1_23
+; CHECK-NEXT:  .LBB1_22: # %cond.load28
+; CHECK-NEXT:    vpbroadcastw 20(%rdi), %ymm1
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; CHECK-NEXT:    testl $2048, %eax # imm = 0x800
+; CHECK-NEXT:    je .LBB1_25
+; CHECK-NEXT:  .LBB1_24: # %cond.load31
+; CHECK-NEXT:    vpbroadcastw 22(%rdi), %ymm1
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; CHECK-NEXT:    testl $4096, %eax # imm = 0x1000
+; CHECK-NEXT:    je .LBB1_27
+; CHECK-NEXT:  .LBB1_26: # %cond.load34
+; CHECK-NEXT:    vpbroadcastw 24(%rdi), %ymm1
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; CHECK-NEXT:    testl $8192, %eax # imm = 0x2000
+; CHECK-NEXT:    je .LBB1_29
+; CHECK-NEXT:  .LBB1_28: # %cond.load37
+; CHECK-NEXT:    vpbroadcastw 26(%rdi), %ymm1
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; CHECK-NEXT:    testl $16384, %eax # imm = 0x4000
+; CHECK-NEXT:    je .LBB1_31
+; CHECK-NEXT:  .LBB1_30: # %cond.load40
+; CHECK-NEXT:    vpbroadcastw 28(%rdi), %ymm1
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; CHECK-NEXT:    testw %ax, %ax
+; CHECK-NEXT:    jns .LBB1_33
+; CHECK-NEXT:  .LBB1_32: # %cond.load43
+; CHECK-NEXT:    vpbroadcastw 30(%rdi), %ymm1
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; CHECK-NEXT:    testl $65536, %eax # imm = 0x10000
+; CHECK-NEXT:    je .LBB1_35
+; CHECK-NEXT:  .LBB1_34: # %cond.load46
+; CHECK-NEXT:    vpbroadcastw 32(%rdi), %xmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $131072, %eax # imm = 0x20000
+; CHECK-NEXT:    je .LBB1_37
+; CHECK-NEXT:  .LBB1_36: # %cond.load49
+; CHECK-NEXT:    vpbroadcastw 34(%rdi), %xmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5,6,7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $262144, %eax # imm = 0x40000
+; CHECK-NEXT:    je .LBB1_39
+; CHECK-NEXT:  .LBB1_38: # %cond.load52
+; CHECK-NEXT:    vpbroadcastw 36(%rdi), %xmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $524288, %eax # imm = 0x80000
+; CHECK-NEXT:    je .LBB1_41
+; CHECK-NEXT:  .LBB1_40: # %cond.load55
+; CHECK-NEXT:    vpbroadcastw 38(%rdi), %xmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $1048576, %eax # imm = 0x100000
+; CHECK-NEXT:    je .LBB1_43
+; CHECK-NEXT:  .LBB1_42: # %cond.load58
+; CHECK-NEXT:    vpbroadcastw 40(%rdi), %xmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6,7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $2097152, %eax # imm = 0x200000
+; CHECK-NEXT:    je .LBB1_45
+; CHECK-NEXT:  .LBB1_44: # %cond.load61
+; CHECK-NEXT:    vpbroadcastw 42(%rdi), %xmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5],xmm2[6,7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $4194304, %eax # imm = 0x400000
+; CHECK-NEXT:    je .LBB1_47
+; CHECK-NEXT:  .LBB1_46: # %cond.load64
+; CHECK-NEXT:    vpbroadcastw 44(%rdi), %xmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $8388608, %eax # imm = 0x800000
+; CHECK-NEXT:    je .LBB1_49
+; CHECK-NEXT:  .LBB1_48: # %cond.load67
+; CHECK-NEXT:    vpbroadcastw 46(%rdi), %xmm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $16777216, %eax # imm = 0x1000000
+; CHECK-NEXT:    je .LBB1_51
+; CHECK-NEXT:  .LBB1_50: # %cond.load70
+; CHECK-NEXT:    vpbroadcastw 48(%rdi), %ymm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $33554432, %eax # imm = 0x2000000
+; CHECK-NEXT:    je .LBB1_53
+; CHECK-NEXT:  .LBB1_52: # %cond.load73
+; CHECK-NEXT:    vpbroadcastw 50(%rdi), %ymm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7,8],ymm1[9],ymm2[10,11,12,13,14,15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $67108864, %eax # imm = 0x4000000
+; CHECK-NEXT:    je .LBB1_55
+; CHECK-NEXT:  .LBB1_54: # %cond.load76
+; CHECK-NEXT:    vpbroadcastw 52(%rdi), %ymm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7,8,9],ymm1[10],ymm2[11,12,13,14,15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $134217728, %eax # imm = 0x8000000
+; CHECK-NEXT:    je .LBB1_57
+; CHECK-NEXT:  .LBB1_56: # %cond.load79
+; CHECK-NEXT:    vpbroadcastw 54(%rdi), %ymm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7,8,9,10],ymm1[11],ymm2[12,13,14,15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $268435456, %eax # imm = 0x10000000
+; CHECK-NEXT:    je .LBB1_59
+; CHECK-NEXT:  .LBB1_58: # %cond.load82
+; CHECK-NEXT:    vpbroadcastw 56(%rdi), %ymm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $536870912, %eax # imm = 0x20000000
+; CHECK-NEXT:    je .LBB1_61
+; CHECK-NEXT:  .LBB1_60: # %cond.load85
+; CHECK-NEXT:    vpbroadcastw 58(%rdi), %ymm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7,8,9,10,11,12],ymm1[13],ymm2[14,15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $1073741824, %eax # imm = 0x40000000
+; CHECK-NEXT:    je .LBB1_63
+; CHECK-NEXT:  .LBB1_62: # %cond.load88
+; CHECK-NEXT:    vpbroadcastw 60(%rdi), %ymm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7,8,9,10,11,12,13],ymm1[14],ymm2[15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    testl $-2147483648, %eax # imm = 0x80000000
+; CHECK-NEXT:    je .LBB1_65
+; CHECK-NEXT:  .LBB1_64: # %cond.load91
+; CHECK-NEXT:    vpbroadcastw 62(%rdi), %ymm1
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %p, i32 2, <32 x i1 > %mask, <32 x half> <half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0, half 2.0>)
+  ret <32 x half> %1
+}
+
 declare <32 x half> @llvm.masked.load.v32f16.p0(ptr, i32, <32 x i1>, <32 x half>)



More information about the libc-commits mailing list