[llvm] [X86][FP16] Customize MLOAD/MSTORE(vXf16) if VLX is not enabled (PR #142331)

Phoebe Wang via llvm-commits llvm-commits at lists.llvm.org
Sun Jun 1 22:22:55 PDT 2025


https://github.com/phoebewang created https://github.com/llvm/llvm-project/pull/142331

Fixes: https://godbolt.org/z/fa4z97xsY
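For context, a minimal sketch of the failing case, assuming the godbolt reproducer boils down to a masked load of vXf16 compiled for AVX512-FP16 without VLX (the same shape as the maskloadu8f16 test updated below). The function name @repro8f16 is invented for illustration; the intrinsic, types, and llc flags are taken from the tests in this patch.

; llc -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 %s   (note: no +avx512vl)
declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32, <8 x i1>, <8 x half>)

define <8 x half> @repro8f16(ptr %addr, <8 x i1> %mask, <8 x half> %passthru) {
  ; Before this patch, MLOAD on v8f16/v16f16 was not marked Custom when VLX is
  ; unavailable, which is what the godbolt link exercises; with the patch, the
  ; operation is widened to the 512-bit form handled in LowerMLOAD/LowerMSTORE.
  %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> %passthru)
  ret <8 x half> %res
}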

From dedbbc78a2fdca213833272b9ce77ed5d46166ba Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Mon, 2 Jun 2025 13:17:22 +0800
Subject: [PATCH] [X86][FP16] Customize MLOAD/MSTORE(vXf16) if VLX is not
 enabled

Fixes: https://godbolt.org/z/fa4z97xsY
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  13 +-
 llvm/test/CodeGen/X86/avx512fp16-mov.ll | 736 +++++++++++++++++-------
 2 files changed, 545 insertions(+), 204 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b1a3e3c006bb3..2beb697548553 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2238,7 +2238,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
     setOperationAction(ISD::ANY_EXTEND,         MVT::v32i8, Custom);
 
-    for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
+    for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
+                    MVT::v16f16, MVT::v8f16}) {
       setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
       setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
     }
@@ -33192,8 +33193,8 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
          "Cannot lower masked load op.");
 
   assert((ScalarVT.getSizeInBits() >= 32 ||
-          (Subtarget.hasBWI() &&
-              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
+          (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
+                                  ScalarVT == MVT::f16))) &&
          "Unsupported masked load op.");
 
   // This operation is legal for targets with VLX, but without
@@ -33240,9 +33241,9 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
          "Cannot lower masked store op.");
 
   assert((ScalarVT.getSizeInBits() >= 32 ||
-          (Subtarget.hasBWI() &&
-              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
-          "Unsupported masked store op.");
+          (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
+                                  ScalarVT == MVT::f16))) &&
+         "Unsupported masked store op.");
 
   // This operation is legal for targets with VLX, but without
   // VLX the vector should be widened to 512 bit
diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
index 82efaffe4014b..526511c850451 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,X64VL
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X64,X64-NOVL
 
 define <8 x half> @broadcastph128(ptr %x) {
 ; X64-LABEL: broadcastph128:
@@ -314,31 +315,47 @@ define <8 x half> @test14(half %x) {
 }
 
 define <16 x half> @test14b(half %x) {
-; X64-LABEL: test14b:
-; X64:       # %bb.0:
-; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
-; X64-NEXT:    retq
+; X64VL-LABEL: test14b:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64VL-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: test14b:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: test14b:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-NOVL-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; X64-NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-NOVL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X64-NOVL-NEXT:    retq
    %res = insertelement <16 x half>zeroinitializer, half %x, i32 0
    ret <16 x half>%res
 }
 
 define <32 x half> @test14c(half %x) {
-; X64-LABEL: test14c:
-; X64:       # %bb.0:
-; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
-; X64-NEXT:    retq
+; X64VL-LABEL: test14c:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64VL-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: test14c:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: test14c:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-NOVL-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; X64-NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; X64-NOVL-NEXT:    retq
    %res = insertelement <32 x half>zeroinitializer, half %x, i32 0
    ret <32 x half>%res
 }
@@ -578,13 +595,13 @@ declare void @llvm.masked.store.v32f16.p0(<32 x half>, ptr, i32, <32 x i1>)
 declare <32 x half> @llvm.masked.load.v32f16.p0(ptr, i32,  <32 x i1>, <32 x half>)
 
 define void @storeu32f16mask(<32 x i1> %mask, ptr %addr, <32 x half> %val) {
-; X64-LABEL: storeu32f16mask:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsllw $7, %ymm0, %ymm0
-; X64-NEXT:    vpmovb2m %ymm0, %k1
-; X64-NEXT:    vmovdqu16 %zmm1, (%rdi) {%k1}
-; X64-NEXT:    vzeroupper
-; X64-NEXT:    retq
+; X64VL-LABEL: storeu32f16mask:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vpsllw $7, %ymm0, %ymm0
+; X64VL-NEXT:    vpmovb2m %ymm0, %k1
+; X64VL-NEXT:    vmovdqu16 %zmm1, (%rdi) {%k1}
+; X64VL-NEXT:    vzeroupper
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: storeu32f16mask:
 ; X86:       # %bb.0:
@@ -594,17 +611,25 @@ define void @storeu32f16mask(<32 x i1> %mask, ptr %addr, <32 x half> %val) {
 ; X86-NEXT:    vmovdqu16 %zmm1, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: storeu32f16mask:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vpsllw $7, %ymm0, %ymm0
+; X64-NOVL-NEXT:    vpmovb2m %zmm0, %k1
+; X64-NOVL-NEXT:    vmovdqu16 %zmm1, (%rdi) {%k1}
+; X64-NOVL-NEXT:    vzeroupper
+; X64-NOVL-NEXT:    retq
   call void @llvm.masked.store.v32f16.p0(<32 x half> %val, ptr %addr, i32 4, <32 x i1>%mask)
   ret void
 }
 
 define <32 x half> @maskloadu32f16(ptr %addr, <32 x half> %val, <32 x i1> %mask) {
-; X64-LABEL: maskloadu32f16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsllw $7, %ymm1, %ymm1
-; X64-NEXT:    vpmovb2m %ymm1, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
-; X64-NEXT:    retq
+; X64VL-LABEL: maskloadu32f16:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vpsllw $7, %ymm1, %ymm1
+; X64VL-NEXT:    vpmovb2m %ymm1, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: maskloadu32f16:
 ; X86:       # %bb.0:
@@ -613,17 +638,24 @@ define <32 x half> @maskloadu32f16(ptr %addr, <32 x half> %val, <32 x i1> %mask)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: maskloadu32f16:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vpsllw $7, %ymm1, %ymm1
+; X64-NOVL-NEXT:    vpmovb2m %zmm1, %k1
+; X64-NOVL-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
+; X64-NOVL-NEXT:    retq
   %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> %val)
   ret <32 x half> %res
 }
 
 define <32 x half> @maskuloadu32f16(ptr %addr, <32 x i1> %mask) {
-; X64-LABEL: maskuloadu32f16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsllw $7, %ymm0, %ymm0
-; X64-NEXT:    vpmovb2m %ymm0, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
-; X64-NEXT:    retq
+; X64VL-LABEL: maskuloadu32f16:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vpsllw $7, %ymm0, %ymm0
+; X64VL-NEXT:    vpmovb2m %ymm0, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: maskuloadu32f16:
 ; X86:       # %bb.0:
@@ -632,17 +664,24 @@ define <32 x half> @maskuloadu32f16(ptr %addr, <32 x i1> %mask) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} {z}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: maskuloadu32f16:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vpsllw $7, %ymm0, %ymm0
+; X64-NOVL-NEXT:    vpmovb2m %zmm0, %k1
+; X64-NOVL-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64-NOVL-NEXT:    retq
   %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> undef)
   ret <32 x half> %res
 }
 
 define <32 x half> @maskzloadu32f16(ptr %addr, <32 x i1> %mask) {
-; X64-LABEL: maskzloadu32f16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsllw $7, %ymm0, %ymm0
-; X64-NEXT:    vpmovb2m %ymm0, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
-; X64-NEXT:    retq
+; X64VL-LABEL: maskzloadu32f16:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vpsllw $7, %ymm0, %ymm0
+; X64VL-NEXT:    vpmovb2m %ymm0, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: maskzloadu32f16:
 ; X86:       # %bb.0:
@@ -651,6 +690,13 @@ define <32 x half> @maskzloadu32f16(ptr %addr, <32 x i1> %mask) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} {z}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: maskzloadu32f16:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vpsllw $7, %ymm0, %ymm0
+; X64-NOVL-NEXT:    vpmovb2m %zmm0, %k1
+; X64-NOVL-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64-NOVL-NEXT:    retq
   %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> zeroinitializer)
   ret <32 x half> %res
 }
@@ -713,11 +759,11 @@ define <16 x half> @load16f16(ptr %a) {
 }
 
 define <16 x half> @load16f16mask(ptr %a, <16 x half> %b, i16 %c) {
-; X64-LABEL: load16f16mask:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %esi, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1}
-; X64-NEXT:    retq
+; X64VL-LABEL: load16f16mask:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    kmovd %esi, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: load16f16mask:
 ; X86:       # %bb.0:
@@ -725,6 +771,15 @@ define <16 x half> @load16f16mask(ptr %a, <16 x half> %b, i16 %c) {
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: load16f16mask:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-NOVL-NEXT:    kmovd %esi, %k1
+; X64-NOVL-NEXT:    vmovdqa (%rdi), %ymm1
+; X64-NOVL-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; X64-NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT:    retq
   %msk = bitcast i16 %c to <16 x i1>
   %res0 = load <16 x half>, ptr %a
   %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
@@ -732,11 +787,11 @@ define <16 x half> @load16f16mask(ptr %a, <16 x half> %b, i16 %c) {
 }
 
 define <16 x half> @load16f16maskz(ptr %a, i16 %c) {
-; X64-LABEL: load16f16maskz:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %esi, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
-; X64-NEXT:    retq
+; X64VL-LABEL: load16f16maskz:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    kmovd %esi, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: load16f16maskz:
 ; X86:       # %bb.0:
@@ -744,6 +799,14 @@ define <16 x half> @load16f16maskz(ptr %a, i16 %c) {
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1} {z}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: load16f16maskz:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    kmovd %esi, %k1
+; X64-NOVL-NEXT:    vmovdqa (%rdi), %ymm0
+; X64-NOVL-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X64-NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT:    retq
   %msk = bitcast i16 %c to <16 x i1>
   %res0 = load <16 x half>, ptr %a
   %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
@@ -766,11 +829,11 @@ define <16 x half> @loadu16f16(ptr %a) {
 }
 
 define <16 x half> @loadu16f16mask(ptr %a, <16 x half> %b, i16 %c) {
-; X64-LABEL: loadu16f16mask:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %esi, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1}
-; X64-NEXT:    retq
+; X64VL-LABEL: loadu16f16mask:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    kmovd %esi, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: loadu16f16mask:
 ; X86:       # %bb.0:
@@ -778,6 +841,15 @@ define <16 x half> @loadu16f16mask(ptr %a, <16 x half> %b, i16 %c) {
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: loadu16f16mask:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-NOVL-NEXT:    kmovd %esi, %k1
+; X64-NOVL-NEXT:    vmovdqu (%rdi), %ymm1
+; X64-NOVL-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; X64-NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT:    retq
   %msk = bitcast i16 %c to <16 x i1>
   %res0 = load <16 x half>, ptr %a, align 8
   %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
@@ -785,11 +857,11 @@ define <16 x half> @loadu16f16mask(ptr %a, <16 x half> %b, i16 %c) {
 }
 
 define <16 x half> @loadu16f16maskz(ptr %a, i16 %c) {
-; X64-LABEL: loadu16f16maskz:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %esi, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
-; X64-NEXT:    retq
+; X64VL-LABEL: loadu16f16maskz:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    kmovd %esi, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: loadu16f16maskz:
 ; X86:       # %bb.0:
@@ -797,6 +869,14 @@ define <16 x half> @loadu16f16maskz(ptr %a, i16 %c) {
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1} {z}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: loadu16f16maskz:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    kmovd %esi, %k1
+; X64-NOVL-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-NOVL-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X64-NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT:    retq
   %msk = bitcast i16 %c to <16 x i1>
   %res0 = load <16 x half>, ptr %a, align 8
   %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
@@ -841,13 +921,13 @@ declare void @llvm.masked.store.v16f16.p0(<16 x half>, ptr, i32, <16 x i1>)
 declare <16 x half> @llvm.masked.load.v16f16.p0(ptr, i32,  <16 x i1>, <16 x half>)
 
 define void @storeu16f16mask(<16 x i1> %mask, ptr %addr, <16 x half> %val) {
-; X64-LABEL: storeu16f16mask:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsllw $7, %xmm0, %xmm0
-; X64-NEXT:    vpmovb2m %xmm0, %k1
-; X64-NEXT:    vmovdqu16 %ymm1, (%rdi) {%k1}
-; X64-NEXT:    vzeroupper
-; X64-NEXT:    retq
+; X64VL-LABEL: storeu16f16mask:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vpsllw $7, %xmm0, %xmm0
+; X64VL-NEXT:    vpmovb2m %xmm0, %k1
+; X64VL-NEXT:    vmovdqu16 %ymm1, (%rdi) {%k1}
+; X64VL-NEXT:    vzeroupper
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: storeu16f16mask:
 ; X86:       # %bb.0:
@@ -857,17 +937,27 @@ define void @storeu16f16mask(<16 x i1> %mask, ptr %addr, <16 x half> %val) {
 ; X86-NEXT:    vmovdqu16 %ymm1, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: storeu16f16mask:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; X64-NOVL-NEXT:    vpsllw $7, %xmm0, %xmm0
+; X64-NOVL-NEXT:    vpmovb2m %zmm0, %k0
+; X64-NOVL-NEXT:    kmovw %k0, %k1
+; X64-NOVL-NEXT:    vmovdqu16 %zmm1, (%rdi) {%k1}
+; X64-NOVL-NEXT:    vzeroupper
+; X64-NOVL-NEXT:    retq
   call void @llvm.masked.store.v16f16.p0(<16 x half> %val, ptr %addr, i32 4, <16 x i1>%mask)
   ret void
 }
 
 define <16 x half> @maskloadu16f16(ptr %addr, <16 x half> %val, <16 x i1> %mask) {
-; X64-LABEL: maskloadu16f16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsllw $7, %xmm1, %xmm1
-; X64-NEXT:    vpmovb2m %xmm1, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1}
-; X64-NEXT:    retq
+; X64VL-LABEL: maskloadu16f16:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vpsllw $7, %xmm1, %xmm1
+; X64VL-NEXT:    vpmovb2m %xmm1, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: maskloadu16f16:
 ; X86:       # %bb.0:
@@ -876,17 +966,27 @@ define <16 x half> @maskloadu16f16(ptr %addr, <16 x half> %val, <16 x i1> %mask)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: maskloadu16f16:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-NOVL-NEXT:    vpsllw $7, %xmm1, %xmm1
+; X64-NOVL-NEXT:    vpmovb2m %zmm1, %k0
+; X64-NOVL-NEXT:    kmovw %k0, %k1
+; X64-NOVL-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
+; X64-NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT:    retq
   %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> %val)
   ret <16 x half> %res
 }
 
 define <16 x half> @maskuloadu16f16(ptr %addr, <16 x i1> %mask) {
-; X64-LABEL: maskuloadu16f16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsllw $7, %xmm0, %xmm0
-; X64-NEXT:    vpmovb2m %xmm0, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
-; X64-NEXT:    retq
+; X64VL-LABEL: maskuloadu16f16:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vpsllw $7, %xmm0, %xmm0
+; X64VL-NEXT:    vpmovb2m %xmm0, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: maskuloadu16f16:
 ; X86:       # %bb.0:
@@ -895,17 +995,26 @@ define <16 x half> @maskuloadu16f16(ptr %addr, <16 x i1> %mask) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1} {z}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: maskuloadu16f16:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vpsllw $7, %xmm0, %xmm0
+; X64-NOVL-NEXT:    vpmovb2m %zmm0, %k0
+; X64-NOVL-NEXT:    kmovw %k0, %k1
+; X64-NOVL-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64-NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT:    retq
   %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> undef)
   ret <16 x half> %res
 }
 
 define <16 x half> @maskzloadu16f16(ptr %addr, <16 x i1> %mask) {
-; X64-LABEL: maskzloadu16f16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsllw $7, %xmm0, %xmm0
-; X64-NEXT:    vpmovb2m %xmm0, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
-; X64-NEXT:    retq
+; X64VL-LABEL: maskzloadu16f16:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vpsllw $7, %xmm0, %xmm0
+; X64VL-NEXT:    vpmovb2m %xmm0, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: maskzloadu16f16:
 ; X86:       # %bb.0:
@@ -914,6 +1023,15 @@ define <16 x half> @maskzloadu16f16(ptr %addr, <16 x i1> %mask) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1} {z}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: maskzloadu16f16:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vpsllw $7, %xmm0, %xmm0
+; X64-NOVL-NEXT:    vpmovb2m %zmm0, %k0
+; X64-NOVL-NEXT:    kmovw %k0, %k1
+; X64-NOVL-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64-NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT:    retq
   %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> zeroinitializer)
   ret <16 x half> %res
 }
@@ -927,34 +1045,51 @@ define <16 x half> @movrr16f16(<16 x half> %a, <16 x half> %b) {
 }
 
 define <16 x half> @movrrk16f16(<16 x half> %a, <16 x half> %b, i16 %msk) {
-; X64-LABEL: movrrk16f16:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
-; X64-NEXT:    retq
+; X64VL-LABEL: movrrk16f16:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    kmovd %edi, %k1
+; X64VL-NEXT:    vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: movrrk16f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: movrrk16f16:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; X64-NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-NOVL-NEXT:    kmovd %edi, %k1
+; X64-NOVL-NEXT:    vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; X64-NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT:    retq
   %mask = bitcast i16 %msk to <16 x i1>
   %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> %b
   ret <16 x half> %res
 }
 
 define <16 x half> @movrrkz16f16(<16 x half> %a, i16 %msk) {
-; X64-LABEL: movrrkz16f16:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
-; X64-NEXT:    retq
+; X64VL-LABEL: movrrkz16f16:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    kmovd %edi, %k1
+; X64VL-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: movrrkz16f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: movrrkz16f16:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-NOVL-NEXT:    kmovd %edi, %k1
+; X64-NOVL-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X64-NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT:    retq
   %mask = bitcast i16 %msk to <16 x i1>
   %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> zeroinitializer
   ret <16 x half> %res
@@ -976,11 +1111,11 @@ define <8 x half> @load8f16(ptr %a) {
 }
 
 define <8 x half> @load8f16mask(ptr %a, <8 x half> %b, i8 %c) {
-; X64-LABEL: load8f16mask:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %esi, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1}
-; X64-NEXT:    retq
+; X64VL-LABEL: load8f16mask:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    kmovd %esi, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: load8f16mask:
 ; X86:       # %bb.0:
@@ -989,6 +1124,16 @@ define <8 x half> @load8f16mask(ptr %a, <8 x half> %b, i8 %c) {
 ; X86-NEXT:    kmovd %ecx, %k1
 ; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: load8f16mask:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; X64-NOVL-NEXT:    kmovd %esi, %k1
+; X64-NOVL-NEXT:    vmovdqa (%rdi), %xmm1
+; X64-NOVL-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT:    vzeroupper
+; X64-NOVL-NEXT:    retq
   %msk = bitcast i8 %c to <8 x i1>
   %res0 = load <8 x half>, ptr %a
   %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
@@ -996,11 +1141,11 @@ define <8 x half> @load8f16mask(ptr %a, <8 x half> %b, i8 %c) {
 }
 
 define <8 x half> @load8f16maskz(ptr %a, i8 %c) {
-; X64-LABEL: load8f16maskz:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %esi, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
-; X64-NEXT:    retq
+; X64VL-LABEL: load8f16maskz:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    kmovd %esi, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: load8f16maskz:
 ; X86:       # %bb.0:
@@ -1009,6 +1154,15 @@ define <8 x half> @load8f16maskz(ptr %a, i8 %c) {
 ; X86-NEXT:    kmovd %ecx, %k1
 ; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} {z}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: load8f16maskz:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    kmovd %esi, %k1
+; X64-NOVL-NEXT:    vmovdqa (%rdi), %xmm0
+; X64-NOVL-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT:    vzeroupper
+; X64-NOVL-NEXT:    retq
   %msk = bitcast i8 %c to <8 x i1>
   %res0 = load <8 x half>, ptr %a
   %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
@@ -1031,11 +1185,11 @@ define <8 x half> @loadu8f16(ptr %a) {
 }
 
 define <8 x half> @loadu8f16mask(ptr %a, <8 x half> %b, i8 %c) {
-; X64-LABEL: loadu8f16mask:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %esi, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1}
-; X64-NEXT:    retq
+; X64VL-LABEL: loadu8f16mask:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    kmovd %esi, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: loadu8f16mask:
 ; X86:       # %bb.0:
@@ -1044,6 +1198,16 @@ define <8 x half> @loadu8f16mask(ptr %a, <8 x half> %b, i8 %c) {
 ; X86-NEXT:    kmovd %ecx, %k1
 ; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: loadu8f16mask:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; X64-NOVL-NEXT:    kmovd %esi, %k1
+; X64-NOVL-NEXT:    vmovdqu (%rdi), %xmm1
+; X64-NOVL-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT:    vzeroupper
+; X64-NOVL-NEXT:    retq
   %msk = bitcast i8 %c to <8 x i1>
   %res0 = load <8 x half>, ptr %a, align 8
   %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
@@ -1051,11 +1215,11 @@ define <8 x half> @loadu8f16mask(ptr %a, <8 x half> %b, i8 %c) {
 }
 
 define <8 x half> @loadu8f16maskz(ptr %a, i8 %c) {
-; X64-LABEL: loadu8f16maskz:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %esi, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
-; X64-NEXT:    retq
+; X64VL-LABEL: loadu8f16maskz:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    kmovd %esi, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: loadu8f16maskz:
 ; X86:       # %bb.0:
@@ -1064,6 +1228,15 @@ define <8 x half> @loadu8f16maskz(ptr %a, i8 %c) {
 ; X86-NEXT:    kmovd %ecx, %k1
 ; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} {z}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: loadu8f16maskz:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    kmovd %esi, %k1
+; X64-NOVL-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-NOVL-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT:    vzeroupper
+; X64-NOVL-NEXT:    retq
   %msk = bitcast i8 %c to <8 x i1>
   %res0 = load <8 x half>, ptr %a, align 8
   %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
@@ -1104,12 +1277,12 @@ declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32, <8 x i1>)
 declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32,  <8 x i1>, <8 x half>)
 
 define void @storeu8f16mask(<8 x i1> %mask, ptr %addr, <8 x half> %val) {
-; X64-LABEL: storeu8f16mask:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsllw $15, %xmm0, %xmm0
-; X64-NEXT:    vpmovw2m %xmm0, %k1
-; X64-NEXT:    vmovdqu16 %xmm1, (%rdi) {%k1}
-; X64-NEXT:    retq
+; X64VL-LABEL: storeu8f16mask:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vpsllw $15, %xmm0, %xmm0
+; X64VL-NEXT:    vpmovw2m %xmm0, %k1
+; X64VL-NEXT:    vmovdqu16 %xmm1, (%rdi) {%k1}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: storeu8f16mask:
 ; X86:       # %bb.0:
@@ -1118,17 +1291,28 @@ define void @storeu8f16mask(<8 x i1> %mask, ptr %addr, <8 x half> %val) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu16 %xmm1, (%eax) {%k1}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: storeu8f16mask:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; X64-NOVL-NEXT:    vpsllw $15, %xmm0, %xmm0
+; X64-NOVL-NEXT:    vpmovw2m %zmm0, %k0
+; X64-NOVL-NEXT:    kshiftld $24, %k0, %k0
+; X64-NOVL-NEXT:    kshiftrd $24, %k0, %k1
+; X64-NOVL-NEXT:    vmovdqu16 %zmm1, (%rdi) {%k1}
+; X64-NOVL-NEXT:    vzeroupper
+; X64-NOVL-NEXT:    retq
   call void @llvm.masked.store.v8f16.p0(<8 x half> %val, ptr %addr, i32 4, <8 x i1>%mask)
   ret void
 }
 
 define <8 x half> @maskloadu8f16(ptr %addr, <8 x half> %val, <8 x i1> %mask) {
-; X64-LABEL: maskloadu8f16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsllw $15, %xmm1, %xmm1
-; X64-NEXT:    vpmovw2m %xmm1, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1}
-; X64-NEXT:    retq
+; X64VL-LABEL: maskloadu8f16:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vpsllw $15, %xmm1, %xmm1
+; X64VL-NEXT:    vpmovw2m %xmm1, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: maskloadu8f16:
 ; X86:       # %bb.0:
@@ -1137,17 +1321,29 @@ define <8 x half> @maskloadu8f16(ptr %addr, <8 x half> %val, <8 x i1> %mask) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: maskloadu8f16:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; X64-NOVL-NEXT:    vpsllw $15, %xmm1, %xmm1
+; X64-NOVL-NEXT:    vpmovw2m %zmm1, %k0
+; X64-NOVL-NEXT:    kshiftld $24, %k0, %k0
+; X64-NOVL-NEXT:    kshiftrd $24, %k0, %k1
+; X64-NOVL-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT:    vzeroupper
+; X64-NOVL-NEXT:    retq
   %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> %val)
   ret <8 x half> %res
 }
 
 define <8 x half> @maskuloadu8f16(ptr %addr, <8 x i1> %mask) {
-; X64-LABEL: maskuloadu8f16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsllw $15, %xmm0, %xmm0
-; X64-NEXT:    vpmovw2m %xmm0, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
-; X64-NEXT:    retq
+; X64VL-LABEL: maskuloadu8f16:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vpsllw $15, %xmm0, %xmm0
+; X64VL-NEXT:    vpmovw2m %xmm0, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: maskuloadu8f16:
 ; X86:       # %bb.0:
@@ -1156,17 +1352,28 @@ define <8 x half> @maskuloadu8f16(ptr %addr, <8 x i1> %mask) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} {z}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: maskuloadu8f16:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vpsllw $15, %xmm0, %xmm0
+; X64-NOVL-NEXT:    vpmovw2m %zmm0, %k0
+; X64-NOVL-NEXT:    kshiftld $24, %k0, %k0
+; X64-NOVL-NEXT:    kshiftrd $24, %k0, %k1
+; X64-NOVL-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT:    vzeroupper
+; X64-NOVL-NEXT:    retq
   %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> undef)
   ret <8 x half> %res
 }
 
 define <8 x half> @maskzloadu8f16(ptr %addr, <8 x i1> %mask) {
-; X64-LABEL: maskzloadu8f16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsllw $15, %xmm0, %xmm0
-; X64-NEXT:    vpmovw2m %xmm0, %k1
-; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
-; X64-NEXT:    retq
+; X64VL-LABEL: maskzloadu8f16:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vpsllw $15, %xmm0, %xmm0
+; X64VL-NEXT:    vpmovw2m %xmm0, %k1
+; X64VL-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: maskzloadu8f16:
 ; X86:       # %bb.0:
@@ -1175,6 +1382,17 @@ define <8 x half> @maskzloadu8f16(ptr %addr, <8 x i1> %mask) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} {z}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: maskzloadu8f16:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vpsllw $15, %xmm0, %xmm0
+; X64-NOVL-NEXT:    vpmovw2m %zmm0, %k0
+; X64-NOVL-NEXT:    kshiftld $24, %k0, %k0
+; X64-NOVL-NEXT:    kshiftrd $24, %k0, %k1
+; X64-NOVL-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT:    vzeroupper
+; X64-NOVL-NEXT:    retq
   %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> zeroinitializer)
   ret <8 x half> %res
 }
@@ -1188,11 +1406,11 @@ define <8 x half> @movrr8f16(<8 x half> %a, <8 x half> %b) {
 }
 
 define <8 x half> @movrrk8f16(<8 x half> %a, <8 x half> %b, i8 %msk) {
-; X64-LABEL: movrrk8f16:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
-; X64-NEXT:    retq
+; X64VL-LABEL: movrrk8f16:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    kmovd %edi, %k1
+; X64VL-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: movrrk8f16:
 ; X86:       # %bb.0:
@@ -1200,17 +1418,27 @@ define <8 x half> @movrrk8f16(<8 x half> %a, <8 x half> %b, i8 %msk) {
 ; X86-NEXT:    kmovd %eax, %k1
 ; X86-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: movrrk8f16:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; X64-NOVL-NEXT:    kmovd %edi, %k1
+; X64-NOVL-NEXT:    vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT:    vzeroupper
+; X64-NOVL-NEXT:    retq
   %mask = bitcast i8 %msk to <8 x i1>
   %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> %b
   ret <8 x half> %res
 }
 
 define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) {
-; X64-LABEL: movrrkz8f16:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1
-; X64-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
-; X64-NEXT:    retq
+; X64VL-LABEL: movrrkz8f16:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    kmovd %edi, %k1
+; X64VL-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: movrrkz8f16:
 ; X86:       # %bb.0:
@@ -1218,18 +1446,69 @@ define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) {
 ; X86-NEXT:    kmovd %eax, %k1
 ; X86-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: movrrkz8f16:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; X64-NOVL-NEXT:    kmovd %edi, %k1
+; X64-NOVL-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT:    vzeroupper
+; X64-NOVL-NEXT:    retq
   %mask = bitcast i8 %msk to <8 x i1>
   %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> zeroinitializer
   ret <8 x half> %res
 }
 
 define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) {
-; CHECK-LABEL: movsh:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
-; CHECK-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
-; CHECK-NEXT:    vaddph %xmm0, %xmm2, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X64VL-LABEL: movsh:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
+; X64VL-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; X64VL-NEXT:    vaddph %xmm0, %xmm2, %xmm0
+; X64VL-NEXT:    retq
+;
+; X86-LABEL: movsh:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
+; X86-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vaddph %xmm0, %xmm2, %xmm0
+; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: movsh:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
+; X64-NOVL-NEXT:    vmovsh %xmm0, %xmm1, %xmm3
+; X64-NOVL-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NOVL-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NOVL-NEXT:    vaddsh %xmm4, %xmm5, %xmm4
+; X64-NOVL-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[3,3,3,3]
+; X64-NOVL-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[3,3,3,3]
+; X64-NOVL-NEXT:    vaddsh %xmm5, %xmm6, %xmm5
+; X64-NOVL-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; X64-NOVL-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NOVL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; X64-NOVL-NEXT:    vaddsh %xmm5, %xmm0, %xmm0
+; X64-NOVL-NEXT:    vshufpd {{.*#+}} xmm5 = xmm3[1,0]
+; X64-NOVL-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; X64-NOVL-NEXT:    vaddsh %xmm5, %xmm6, %xmm5
+; X64-NOVL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; X64-NOVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; X64-NOVL-NEXT:    vpsrlq $48, %xmm1, %xmm4
+; X64-NOVL-NEXT:    vpsrlq $48, %xmm2, %xmm5
+; X64-NOVL-NEXT:    vaddsh %xmm4, %xmm5, %xmm4
+; X64-NOVL-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; X64-NOVL-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; X64-NOVL-NEXT:    vaddsh %xmm5, %xmm6, %xmm5
+; X64-NOVL-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; X64-NOVL-NEXT:    vaddsh %xmm3, %xmm2, %xmm3
+; X64-NOVL-NEXT:    vpsrld $16, %xmm1, %xmm1
+; X64-NOVL-NEXT:    vpsrld $16, %xmm2, %xmm2
+; X64-NOVL-NEXT:    vaddsh %xmm1, %xmm2, %xmm1
+; X64-NOVL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X64-NOVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; X64-NOVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NOVL-NEXT:    retq
   %res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
   %res2 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %res = fadd <8 x half> %res1, %res2
@@ -1939,14 +2218,14 @@ define <8 x half> @regression1(<8 x half> %a, <8 x half> %b) {
 }
 
 define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2, <4 x float> %3, ptr %4) {
-; X64-LABEL: regression2:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovw (%rsi), %xmm0
-; X64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; X64-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; X64-NEXT:    retq
+; X64VL-LABEL: regression2:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vmovw (%rsi), %xmm0
+; X64VL-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64VL-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; X64VL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; X64VL-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: regression2:
 ; X86:       # %bb.0:
@@ -1957,6 +2236,16 @@ define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2,
 ; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
 ; X86-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: regression2:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vmovw (%rsi), %xmm0
+; X64-NOVL-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-NOVL-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; X64-NOVL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; X64-NOVL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [3.92156886E-3,3.92156886E-3,3.92156886E-3,3.92156886E-3]
+; X64-NOVL-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; X64-NOVL-NEXT:    retq
   %6 = load i8, ptr %4, align 1
   %7 = getelementptr i8, ptr %4, i64 1
   %8 = addrspacecast ptr %7 to ptr addrspace(4)
@@ -1973,13 +2262,13 @@ define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2,
 ; Make sure load/stores of v4f16 are handled well on 32-bit targets where
 ; default widening legalization can't use i64.
 define void @load_store_v4f16(ptr %x, ptr %y, ptr %z) {
-; X64-LABEL: load_store_v4f16:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    vaddph %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vmovlps %xmm0, (%rdx)
-; X64-NEXT:    retq
+; X64VL-LABEL: load_store_v4f16:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64VL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64VL-NEXT:    vaddph %xmm1, %xmm0, %xmm0
+; X64VL-NEXT:    vmovlps %xmm0, (%rdx)
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: load_store_v4f16:
 ; X86:       # %bb.0:
@@ -1991,6 +2280,26 @@ define void @load_store_v4f16(ptr %x, ptr %y, ptr %z) {
 ; X86-NEXT:    vaddph %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovlps %xmm0, (%eax)
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: load_store_v4f16:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; X64-NOVL-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-NOVL-NEXT:    vpsrlq $48, %xmm1, %xmm2
+; X64-NOVL-NEXT:    vpsrlq $48, %xmm0, %xmm3
+; X64-NOVL-NEXT:    vaddsh %xmm2, %xmm3, %xmm2
+; X64-NOVL-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; X64-NOVL-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; X64-NOVL-NEXT:    vaddsh %xmm3, %xmm4, %xmm3
+; X64-NOVL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X64-NOVL-NEXT:    vaddsh %xmm1, %xmm0, %xmm3
+; X64-NOVL-NEXT:    vpsrld $16, %xmm1, %xmm1
+; X64-NOVL-NEXT:    vpsrld $16, %xmm0, %xmm0
+; X64-NOVL-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
+; X64-NOVL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; X64-NOVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NOVL-NEXT:    vmovq %xmm0, (%rdx)
+; X64-NOVL-NEXT:    retq
   %a = load <4 x half>, ptr %x
   %b = load <4 x half>, ptr %y
   %c = fadd <4 x half> %a, %b
@@ -2044,20 +2353,20 @@ define <16 x i16> @test22(ptr %mem) nounwind {
 }
 
 define void @pr52560(i8 %0, <2 x i16> %1, ptr %c) nounwind {
-; X64-LABEL: pr52560:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movsbl %dil, %eax
-; X64-NEXT:    vmovw %eax, %xmm1
-; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT:    vpcmpgtw %xmm2, %xmm1, %k1
-; X64-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
-; X64-NEXT:    vmovw %xmm0, %eax
-; X64-NEXT:    testw %ax, %ax
-; X64-NEXT:    je .LBB123_2
-; X64-NEXT:  # %bb.1: # %for.body.preheader
-; X64-NEXT:    movb $0, (%rsi)
-; X64-NEXT:  .LBB123_2: # %for.end
-; X64-NEXT:    retq
+; X64VL-LABEL: pr52560:
+; X64VL:       # %bb.0: # %entry
+; X64VL-NEXT:    movsbl %dil, %eax
+; X64VL-NEXT:    vmovw %eax, %xmm1
+; X64VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; X64VL-NEXT:    vpcmpgtw %xmm2, %xmm1, %k1
+; X64VL-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; X64VL-NEXT:    vmovw %xmm0, %eax
+; X64VL-NEXT:    testw %ax, %ax
+; X64VL-NEXT:    je .LBB123_2
+; X64VL-NEXT:  # %bb.1: # %for.body.preheader
+; X64VL-NEXT:    movb $0, (%rsi)
+; X64VL-NEXT:  .LBB123_2: # %for.end
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: pr52560:
 ; X86:       # %bb.0: # %entry
@@ -2074,6 +2383,23 @@ define void @pr52560(i8 %0, <2 x i16> %1, ptr %c) nounwind {
 ; X86-NEXT:    movb $0, (%eax)
 ; X86-NEXT:  .LBB123_2: # %for.end
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: pr52560:
+; X64-NOVL:       # %bb.0: # %entry
+; X64-NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; X64-NOVL-NEXT:    movsbl %dil, %eax
+; X64-NOVL-NEXT:    vmovw %eax, %xmm1
+; X64-NOVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; X64-NOVL-NEXT:    vpcmpgtw %zmm2, %zmm1, %k1
+; X64-NOVL-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X64-NOVL-NEXT:    vmovw %xmm0, %eax
+; X64-NOVL-NEXT:    testw %ax, %ax
+; X64-NOVL-NEXT:    je .LBB123_2
+; X64-NOVL-NEXT:  # %bb.1: # %for.body.preheader
+; X64-NOVL-NEXT:    movb $0, (%rsi)
+; X64-NOVL-NEXT:  .LBB123_2: # %for.end
+; X64-NOVL-NEXT:    vzeroupper
+; X64-NOVL-NEXT:    retq
 entry:
   %conv = sext i8 %0 to i16
   %2 = insertelement <2 x i16> <i16 poison, i16 0>, i16 %conv, i32 0
@@ -2092,17 +2418,17 @@ for.end:                                          ; preds = %for.body.preheader,
 }
 
 define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width"="256" "prefer-vector-width"="256" nounwind {
-; X64-LABEL: pr52561:
-; X64:       # %bb.0:
-; X64-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
-; X64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; X64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
-; X64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; X64-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
-; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT:    vmovsh %xmm0, %xmm2, %xmm0
-; X64-NEXT:    retq
+; X64VL-LABEL: pr52561:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
+; X64VL-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; X64VL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
+; X64VL-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; X64VL-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; X64VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; X64VL-NEXT:    vmovsh %xmm0, %xmm2, %xmm0
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: pr52561:
 ; X86:       # %bb.0:
@@ -2121,6 +2447,13 @@ define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: pr52561:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; X64-NOVL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; X64-NOVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; X64-NOVL-NEXT:    retq
   %1 = add <16 x i32> %a, <i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112>
   %2 = add <16 x i32> %1, %b
   %3 = and <16 x i32> %2, <i32 65535, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 65535>
@@ -2128,13 +2461,13 @@ define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width
 }
 
 define <8 x i16> @pr59628_xmm(i16 %arg) {
-; X64-LABEL: pr59628_xmm:
-; X64:       # %bb.0:
-; X64-NEXT:    vmovw %edi, %xmm0
-; X64-NEXT:    vpbroadcastw %edi, %xmm1
-; X64-NEXT:    vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
-; X64-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
-; X64-NEXT:    retq
+; X64VL-LABEL: pr59628_xmm:
+; X64VL:       # %bb.0:
+; X64VL-NEXT:    vmovw %edi, %xmm0
+; X64VL-NEXT:    vpbroadcastw %edi, %xmm1
+; X64VL-NEXT:    vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
+; X64VL-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: pr59628_xmm:
 ; X86:       # %bb.0:
@@ -2145,6 +2478,13 @@ define <8 x i16> @pr59628_xmm(i16 %arg) {
 ; X86-NEXT:    vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %k1
 ; X86-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    retl
+;
+; X64-NOVL-LABEL: pr59628_xmm:
+; X64-NOVL:       # %bb.0:
+; X64-NOVL-NEXT:    vmovw %edi, %xmm0
+; X64-NOVL-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; X64-NOVL-NEXT:    vpandn %xmm0, %xmm1, %xmm0
+; X64-NOVL-NEXT:    retq
   %I1 = insertelement <8 x i16> zeroinitializer, i16 %arg, i16 0
   %I2 = insertelement <8 x i16> %I1, i16 0, i16 %arg
   ret <8 x i16> %I2
