[llvm] [X86][FP16] Customize MLOAD/MSTORE(vXf16) if VLX is not enabled (PR #142331)
Phoebe Wang via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 1 22:22:55 PDT 2025
https://github.com/phoebewang created https://github.com/llvm/llvm-project/pull/142331
Fixes: https://godbolt.org/z/fa4z97xsY
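For context, the path touched here handles masked loads/stores of vXf16 vectors on targets with AVX512-FP16 but without AVX512VL: the patch marks them Custom so they are widened to 512 bits, like the existing i8/i16 cases. The exact reproducer is behind the godbolt link above; the snippet below is only an illustrative sketch (assumed to be close to that case) built from the same masked-load intrinsic the updated test file already exercises.

; Illustrative only -- assumed shape of the failing case, not the verbatim godbolt input.
; Compile with: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16   (note: no +avx512vl)
define <8 x half> @masked_load_v8f16(ptr %p, <8 x i1> %m, <8 x half> %passthru) {
  ; Without VLX this v8f16 masked load is now lowered Custom and widened to a 512-bit operation.
  %v = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %p, i32 4, <8 x i1> %m, <8 x half> %passthru)
  ret <8 x half> %v
}
declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32, <8 x i1>, <8 x half>)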
From dedbbc78a2fdca213833272b9ce77ed5d46166ba Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Mon, 2 Jun 2025 13:17:22 +0800
Subject: [PATCH] [X86][FP16] Customize MLOAD/MSTORE(vXf16) if VLX is not
enabled
Fixes: https://godbolt.org/z/fa4z97xsY
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 13 +-
llvm/test/CodeGen/X86/avx512fp16-mov.ll | 736 +++++++++++++++++-------
2 files changed, 545 insertions(+), 204 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b1a3e3c006bb3..2beb697548553 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2238,7 +2238,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
- for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
+ for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
+ MVT::v16f16, MVT::v8f16}) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
}
@@ -33192,8 +33193,8 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
"Cannot lower masked load op.");
assert((ScalarVT.getSizeInBits() >= 32 ||
- (Subtarget.hasBWI() &&
- (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
+ (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
+ ScalarVT == MVT::f16))) &&
"Unsupported masked load op.");
// This operation is legal for targets with VLX, but without
@@ -33240,9 +33241,9 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
"Cannot lower masked store op.");
assert((ScalarVT.getSizeInBits() >= 32 ||
- (Subtarget.hasBWI() &&
- (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
- "Unsupported masked store op.");
+ (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
+ ScalarVT == MVT::f16))) &&
+ "Unsupported masked store op.");
// This operation is legal for targets with VLX, but without
// VLX the vector should be widened to 512 bit
diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
index 82efaffe4014b..526511c850451 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,X64VL
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X64,X64-NOVL
define <8 x half> @broadcastph128(ptr %x) {
; X64-LABEL: broadcastph128:
@@ -314,31 +315,47 @@ define <8 x half> @test14(half %x) {
}
define <16 x half> @test14b(half %x) {
-; X64-LABEL: test14b:
-; X64: # %bb.0:
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0
-; X64-NEXT: retq
+; X64VL-LABEL: test14b:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64VL-NEXT: vmovsh %xmm0, %xmm1, %xmm0
+; X64VL-NEXT: retq
;
; X86-LABEL: test14b:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: test14b:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NOVL-NEXT: vmovsh %xmm0, %xmm1, %xmm0
+; X64-NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NOVL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X64-NOVL-NEXT: retq
%res = insertelement <16 x half>zeroinitializer, half %x, i32 0
ret <16 x half>%res
}
define <32 x half> @test14c(half %x) {
-; X64-LABEL: test14c:
-; X64: # %bb.0:
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0
-; X64-NEXT: retq
+; X64VL-LABEL: test14c:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64VL-NEXT: vmovsh %xmm0, %xmm1, %xmm0
+; X64VL-NEXT: retq
;
; X86-LABEL: test14c:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: test14c:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NOVL-NEXT: vmovsh %xmm0, %xmm1, %xmm0
+; X64-NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NOVL-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; X64-NOVL-NEXT: retq
%res = insertelement <32 x half>zeroinitializer, half %x, i32 0
ret <32 x half>%res
}
@@ -578,13 +595,13 @@ declare void @llvm.masked.store.v32f16.p0(<32 x half>, ptr, i32, <32 x i1>)
declare <32 x half> @llvm.masked.load.v32f16.p0(ptr, i32, <32 x i1>, <32 x half>)
define void @storeu32f16mask(<32 x i1> %mask, ptr %addr, <32 x half> %val) {
-; X64-LABEL: storeu32f16mask:
-; X64: # %bb.0:
-; X64-NEXT: vpsllw $7, %ymm0, %ymm0
-; X64-NEXT: vpmovb2m %ymm0, %k1
-; X64-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64VL-LABEL: storeu32f16mask:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vpsllw $7, %ymm0, %ymm0
+; X64VL-NEXT: vpmovb2m %ymm0, %k1
+; X64VL-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
+; X64VL-NEXT: vzeroupper
+; X64VL-NEXT: retq
;
; X86-LABEL: storeu32f16mask:
; X86: # %bb.0:
@@ -594,17 +611,25 @@ define void @storeu32f16mask(<32 x i1> %mask, ptr %addr, <32 x half> %val) {
; X86-NEXT: vmovdqu16 %zmm1, (%eax) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: storeu32f16mask:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vpsllw $7, %ymm0, %ymm0
+; X64-NOVL-NEXT: vpmovb2m %zmm0, %k1
+; X64-NOVL-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
+; X64-NOVL-NEXT: vzeroupper
+; X64-NOVL-NEXT: retq
call void @llvm.masked.store.v32f16.p0(<32 x half> %val, ptr %addr, i32 4, <32 x i1>%mask)
ret void
}
define <32 x half> @maskloadu32f16(ptr %addr, <32 x half> %val, <32 x i1> %mask) {
-; X64-LABEL: maskloadu32f16:
-; X64: # %bb.0:
-; X64-NEXT: vpsllw $7, %ymm1, %ymm1
-; X64-NEXT: vpmovb2m %ymm1, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
-; X64-NEXT: retq
+; X64VL-LABEL: maskloadu32f16:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vpsllw $7, %ymm1, %ymm1
+; X64VL-NEXT: vpmovb2m %ymm1, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
+; X64VL-NEXT: retq
;
; X86-LABEL: maskloadu32f16:
; X86: # %bb.0:
@@ -613,17 +638,24 @@ define <32 x half> @maskloadu32f16(ptr %addr, <32 x half> %val, <32 x i1> %mask)
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: maskloadu32f16:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vpsllw $7, %ymm1, %ymm1
+; X64-NOVL-NEXT: vpmovb2m %zmm1, %k1
+; X64-NOVL-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
+; X64-NOVL-NEXT: retq
%res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> %val)
ret <32 x half> %res
}
define <32 x half> @maskuloadu32f16(ptr %addr, <32 x i1> %mask) {
-; X64-LABEL: maskuloadu32f16:
-; X64: # %bb.0:
-; X64-NEXT: vpsllw $7, %ymm0, %ymm0
-; X64-NEXT: vpmovb2m %ymm0, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
-; X64-NEXT: retq
+; X64VL-LABEL: maskuloadu32f16:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vpsllw $7, %ymm0, %ymm0
+; X64VL-NEXT: vpmovb2m %ymm0, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64VL-NEXT: retq
;
; X86-LABEL: maskuloadu32f16:
; X86: # %bb.0:
@@ -632,17 +664,24 @@ define <32 x half> @maskuloadu32f16(ptr %addr, <32 x i1> %mask) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: maskuloadu32f16:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vpsllw $7, %ymm0, %ymm0
+; X64-NOVL-NEXT: vpmovb2m %zmm0, %k1
+; X64-NOVL-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64-NOVL-NEXT: retq
%res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> undef)
ret <32 x half> %res
}
define <32 x half> @maskzloadu32f16(ptr %addr, <32 x i1> %mask) {
-; X64-LABEL: maskzloadu32f16:
-; X64: # %bb.0:
-; X64-NEXT: vpsllw $7, %ymm0, %ymm0
-; X64-NEXT: vpmovb2m %ymm0, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
-; X64-NEXT: retq
+; X64VL-LABEL: maskzloadu32f16:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vpsllw $7, %ymm0, %ymm0
+; X64VL-NEXT: vpmovb2m %ymm0, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64VL-NEXT: retq
;
; X86-LABEL: maskzloadu32f16:
; X86: # %bb.0:
@@ -651,6 +690,13 @@ define <32 x half> @maskzloadu32f16(ptr %addr, <32 x i1> %mask) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: maskzloadu32f16:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vpsllw $7, %ymm0, %ymm0
+; X64-NOVL-NEXT: vpmovb2m %zmm0, %k1
+; X64-NOVL-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64-NOVL-NEXT: retq
%res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> zeroinitializer)
ret <32 x half> %res
}
@@ -713,11 +759,11 @@ define <16 x half> @load16f16(ptr %a) {
}
define <16 x half> @load16f16mask(ptr %a, <16 x half> %b, i16 %c) {
-; X64-LABEL: load16f16mask:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %esi, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
-; X64-NEXT: retq
+; X64VL-LABEL: load16f16mask:
+; X64VL: # %bb.0:
+; X64VL-NEXT: kmovd %esi, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
+; X64VL-NEXT: retq
;
; X86-LABEL: load16f16mask:
; X86: # %bb.0:
@@ -725,6 +771,15 @@ define <16 x half> @load16f16mask(ptr %a, <16 x half> %b, i16 %c) {
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: load16f16mask:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-NOVL-NEXT: kmovd %esi, %k1
+; X64-NOVL-NEXT: vmovdqa (%rdi), %ymm1
+; X64-NOVL-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; X64-NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT: retq
%msk = bitcast i16 %c to <16 x i1>
%res0 = load <16 x half>, ptr %a
%res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
@@ -732,11 +787,11 @@ define <16 x half> @load16f16mask(ptr %a, <16 x half> %b, i16 %c) {
}
define <16 x half> @load16f16maskz(ptr %a, i16 %c) {
-; X64-LABEL: load16f16maskz:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %esi, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
-; X64-NEXT: retq
+; X64VL-LABEL: load16f16maskz:
+; X64VL: # %bb.0:
+; X64VL-NEXT: kmovd %esi, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
+; X64VL-NEXT: retq
;
; X86-LABEL: load16f16maskz:
; X86: # %bb.0:
@@ -744,6 +799,14 @@ define <16 x half> @load16f16maskz(ptr %a, i16 %c) {
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: load16f16maskz:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: kmovd %esi, %k1
+; X64-NOVL-NEXT: vmovdqa (%rdi), %ymm0
+; X64-NOVL-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X64-NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT: retq
%msk = bitcast i16 %c to <16 x i1>
%res0 = load <16 x half>, ptr %a
%res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
@@ -766,11 +829,11 @@ define <16 x half> @loadu16f16(ptr %a) {
}
define <16 x half> @loadu16f16mask(ptr %a, <16 x half> %b, i16 %c) {
-; X64-LABEL: loadu16f16mask:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %esi, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
-; X64-NEXT: retq
+; X64VL-LABEL: loadu16f16mask:
+; X64VL: # %bb.0:
+; X64VL-NEXT: kmovd %esi, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
+; X64VL-NEXT: retq
;
; X86-LABEL: loadu16f16mask:
; X86: # %bb.0:
@@ -778,6 +841,15 @@ define <16 x half> @loadu16f16mask(ptr %a, <16 x half> %b, i16 %c) {
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: loadu16f16mask:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-NOVL-NEXT: kmovd %esi, %k1
+; X64-NOVL-NEXT: vmovdqu (%rdi), %ymm1
+; X64-NOVL-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; X64-NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT: retq
%msk = bitcast i16 %c to <16 x i1>
%res0 = load <16 x half>, ptr %a, align 8
%res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
@@ -785,11 +857,11 @@ define <16 x half> @loadu16f16mask(ptr %a, <16 x half> %b, i16 %c) {
}
define <16 x half> @loadu16f16maskz(ptr %a, i16 %c) {
-; X64-LABEL: loadu16f16maskz:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %esi, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
-; X64-NEXT: retq
+; X64VL-LABEL: loadu16f16maskz:
+; X64VL: # %bb.0:
+; X64VL-NEXT: kmovd %esi, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
+; X64VL-NEXT: retq
;
; X86-LABEL: loadu16f16maskz:
; X86: # %bb.0:
@@ -797,6 +869,14 @@ define <16 x half> @loadu16f16maskz(ptr %a, i16 %c) {
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: loadu16f16maskz:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: kmovd %esi, %k1
+; X64-NOVL-NEXT: vmovdqu (%rdi), %ymm0
+; X64-NOVL-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X64-NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT: retq
%msk = bitcast i16 %c to <16 x i1>
%res0 = load <16 x half>, ptr %a, align 8
%res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
@@ -841,13 +921,13 @@ declare void @llvm.masked.store.v16f16.p0(<16 x half>, ptr, i32, <16 x i1>)
declare <16 x half> @llvm.masked.load.v16f16.p0(ptr, i32, <16 x i1>, <16 x half>)
define void @storeu16f16mask(<16 x i1> %mask, ptr %addr, <16 x half> %val) {
-; X64-LABEL: storeu16f16mask:
-; X64: # %bb.0:
-; X64-NEXT: vpsllw $7, %xmm0, %xmm0
-; X64-NEXT: vpmovb2m %xmm0, %k1
-; X64-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1}
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64VL-LABEL: storeu16f16mask:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vpsllw $7, %xmm0, %xmm0
+; X64VL-NEXT: vpmovb2m %xmm0, %k1
+; X64VL-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1}
+; X64VL-NEXT: vzeroupper
+; X64VL-NEXT: retq
;
; X86-LABEL: storeu16f16mask:
; X86: # %bb.0:
@@ -857,17 +937,27 @@ define void @storeu16f16mask(<16 x i1> %mask, ptr %addr, <16 x half> %val) {
; X86-NEXT: vmovdqu16 %ymm1, (%eax) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: storeu16f16mask:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; X64-NOVL-NEXT: vpsllw $7, %xmm0, %xmm0
+; X64-NOVL-NEXT: vpmovb2m %zmm0, %k0
+; X64-NOVL-NEXT: kmovw %k0, %k1
+; X64-NOVL-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
+; X64-NOVL-NEXT: vzeroupper
+; X64-NOVL-NEXT: retq
call void @llvm.masked.store.v16f16.p0(<16 x half> %val, ptr %addr, i32 4, <16 x i1>%mask)
ret void
}
define <16 x half> @maskloadu16f16(ptr %addr, <16 x half> %val, <16 x i1> %mask) {
-; X64-LABEL: maskloadu16f16:
-; X64: # %bb.0:
-; X64-NEXT: vpsllw $7, %xmm1, %xmm1
-; X64-NEXT: vpmovb2m %xmm1, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
-; X64-NEXT: retq
+; X64VL-LABEL: maskloadu16f16:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vpsllw $7, %xmm1, %xmm1
+; X64VL-NEXT: vpmovb2m %xmm1, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
+; X64VL-NEXT: retq
;
; X86-LABEL: maskloadu16f16:
; X86: # %bb.0:
@@ -876,17 +966,27 @@ define <16 x half> @maskloadu16f16(ptr %addr, <16 x half> %val, <16 x i1> %mask)
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: maskloadu16f16:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-NOVL-NEXT: vpsllw $7, %xmm1, %xmm1
+; X64-NOVL-NEXT: vpmovb2m %zmm1, %k0
+; X64-NOVL-NEXT: kmovw %k0, %k1
+; X64-NOVL-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
+; X64-NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT: retq
%res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> %val)
ret <16 x half> %res
}
define <16 x half> @maskuloadu16f16(ptr %addr, <16 x i1> %mask) {
-; X64-LABEL: maskuloadu16f16:
-; X64: # %bb.0:
-; X64-NEXT: vpsllw $7, %xmm0, %xmm0
-; X64-NEXT: vpmovb2m %xmm0, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
-; X64-NEXT: retq
+; X64VL-LABEL: maskuloadu16f16:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vpsllw $7, %xmm0, %xmm0
+; X64VL-NEXT: vpmovb2m %xmm0, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
+; X64VL-NEXT: retq
;
; X86-LABEL: maskuloadu16f16:
; X86: # %bb.0:
@@ -895,17 +995,26 @@ define <16 x half> @maskuloadu16f16(ptr %addr, <16 x i1> %mask) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: maskuloadu16f16:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vpsllw $7, %xmm0, %xmm0
+; X64-NOVL-NEXT: vpmovb2m %zmm0, %k0
+; X64-NOVL-NEXT: kmovw %k0, %k1
+; X64-NOVL-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64-NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT: retq
%res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> undef)
ret <16 x half> %res
}
define <16 x half> @maskzloadu16f16(ptr %addr, <16 x i1> %mask) {
-; X64-LABEL: maskzloadu16f16:
-; X64: # %bb.0:
-; X64-NEXT: vpsllw $7, %xmm0, %xmm0
-; X64-NEXT: vpmovb2m %xmm0, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
-; X64-NEXT: retq
+; X64VL-LABEL: maskzloadu16f16:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vpsllw $7, %xmm0, %xmm0
+; X64VL-NEXT: vpmovb2m %xmm0, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
+; X64VL-NEXT: retq
;
; X86-LABEL: maskzloadu16f16:
; X86: # %bb.0:
@@ -914,6 +1023,15 @@ define <16 x half> @maskzloadu16f16(ptr %addr, <16 x i1> %mask) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: maskzloadu16f16:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vpsllw $7, %xmm0, %xmm0
+; X64-NOVL-NEXT: vpmovb2m %zmm0, %k0
+; X64-NOVL-NEXT: kmovw %k0, %k1
+; X64-NOVL-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64-NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT: retq
%res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> zeroinitializer)
ret <16 x half> %res
}
@@ -927,34 +1045,51 @@ define <16 x half> @movrr16f16(<16 x half> %a, <16 x half> %b) {
}
define <16 x half> @movrrk16f16(<16 x half> %a, <16 x half> %b, i16 %msk) {
-; X64-LABEL: movrrk16f16:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %edi, %k1
-; X64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
-; X64-NEXT: retq
+; X64VL-LABEL: movrrk16f16:
+; X64VL: # %bb.0:
+; X64VL-NEXT: kmovd %edi, %k1
+; X64VL-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; X64VL-NEXT: retq
;
; X86-LABEL: movrrk16f16:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: movrrk16f16:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; X64-NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-NOVL-NEXT: kmovd %edi, %k1
+; X64-NOVL-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; X64-NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT: retq
%mask = bitcast i16 %msk to <16 x i1>
%res = select <16 x i1> %mask, <16 x half> %a, <16 x half> %b
ret <16 x half> %res
}
define <16 x half> @movrrkz16f16(<16 x half> %a, i16 %msk) {
-; X64-LABEL: movrrkz16f16:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %edi, %k1
-; X64-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
-; X64-NEXT: retq
+; X64VL-LABEL: movrrkz16f16:
+; X64VL: # %bb.0:
+; X64VL-NEXT: kmovd %edi, %k1
+; X64VL-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
+; X64VL-NEXT: retq
;
; X86-LABEL: movrrkz16f16:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: movrrkz16f16:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-NOVL-NEXT: kmovd %edi, %k1
+; X64-NOVL-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X64-NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-NOVL-NEXT: retq
%mask = bitcast i16 %msk to <16 x i1>
%res = select <16 x i1> %mask, <16 x half> %a, <16 x half> zeroinitializer
ret <16 x half> %res
@@ -976,11 +1111,11 @@ define <8 x half> @load8f16(ptr %a) {
}
define <8 x half> @load8f16mask(ptr %a, <8 x half> %b, i8 %c) {
-; X64-LABEL: load8f16mask:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %esi, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
-; X64-NEXT: retq
+; X64VL-LABEL: load8f16mask:
+; X64VL: # %bb.0:
+; X64VL-NEXT: kmovd %esi, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
+; X64VL-NEXT: retq
;
; X86-LABEL: load8f16mask:
; X86: # %bb.0:
@@ -989,6 +1124,16 @@ define <8 x half> @load8f16mask(ptr %a, <8 x half> %b, i8 %c) {
; X86-NEXT: kmovd %ecx, %k1
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: load8f16mask:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; X64-NOVL-NEXT: kmovd %esi, %k1
+; X64-NOVL-NEXT: vmovdqa (%rdi), %xmm1
+; X64-NOVL-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT: vzeroupper
+; X64-NOVL-NEXT: retq
%msk = bitcast i8 %c to <8 x i1>
%res0 = load <8 x half>, ptr %a
%res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
@@ -996,11 +1141,11 @@ define <8 x half> @load8f16mask(ptr %a, <8 x half> %b, i8 %c) {
}
define <8 x half> @load8f16maskz(ptr %a, i8 %c) {
-; X64-LABEL: load8f16maskz:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %esi, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
-; X64-NEXT: retq
+; X64VL-LABEL: load8f16maskz:
+; X64VL: # %bb.0:
+; X64VL-NEXT: kmovd %esi, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
+; X64VL-NEXT: retq
;
; X86-LABEL: load8f16maskz:
; X86: # %bb.0:
@@ -1009,6 +1154,15 @@ define <8 x half> @load8f16maskz(ptr %a, i8 %c) {
; X86-NEXT: kmovd %ecx, %k1
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: load8f16maskz:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: kmovd %esi, %k1
+; X64-NOVL-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NOVL-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT: vzeroupper
+; X64-NOVL-NEXT: retq
%msk = bitcast i8 %c to <8 x i1>
%res0 = load <8 x half>, ptr %a
%res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
@@ -1031,11 +1185,11 @@ define <8 x half> @loadu8f16(ptr %a) {
}
define <8 x half> @loadu8f16mask(ptr %a, <8 x half> %b, i8 %c) {
-; X64-LABEL: loadu8f16mask:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %esi, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
-; X64-NEXT: retq
+; X64VL-LABEL: loadu8f16mask:
+; X64VL: # %bb.0:
+; X64VL-NEXT: kmovd %esi, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
+; X64VL-NEXT: retq
;
; X86-LABEL: loadu8f16mask:
; X86: # %bb.0:
@@ -1044,6 +1198,16 @@ define <8 x half> @loadu8f16mask(ptr %a, <8 x half> %b, i8 %c) {
; X86-NEXT: kmovd %ecx, %k1
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: loadu8f16mask:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; X64-NOVL-NEXT: kmovd %esi, %k1
+; X64-NOVL-NEXT: vmovdqu (%rdi), %xmm1
+; X64-NOVL-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT: vzeroupper
+; X64-NOVL-NEXT: retq
%msk = bitcast i8 %c to <8 x i1>
%res0 = load <8 x half>, ptr %a, align 8
%res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
@@ -1051,11 +1215,11 @@ define <8 x half> @loadu8f16mask(ptr %a, <8 x half> %b, i8 %c) {
}
define <8 x half> @loadu8f16maskz(ptr %a, i8 %c) {
-; X64-LABEL: loadu8f16maskz:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %esi, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
-; X64-NEXT: retq
+; X64VL-LABEL: loadu8f16maskz:
+; X64VL: # %bb.0:
+; X64VL-NEXT: kmovd %esi, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
+; X64VL-NEXT: retq
;
; X86-LABEL: loadu8f16maskz:
; X86: # %bb.0:
@@ -1064,6 +1228,15 @@ define <8 x half> @loadu8f16maskz(ptr %a, i8 %c) {
; X86-NEXT: kmovd %ecx, %k1
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: loadu8f16maskz:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: kmovd %esi, %k1
+; X64-NOVL-NEXT: vmovdqu (%rdi), %xmm0
+; X64-NOVL-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT: vzeroupper
+; X64-NOVL-NEXT: retq
%msk = bitcast i8 %c to <8 x i1>
%res0 = load <8 x half>, ptr %a, align 8
%res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
@@ -1104,12 +1277,12 @@ declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32, <8 x i1>)
declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32, <8 x i1>, <8 x half>)
define void @storeu8f16mask(<8 x i1> %mask, ptr %addr, <8 x half> %val) {
-; X64-LABEL: storeu8f16mask:
-; X64: # %bb.0:
-; X64-NEXT: vpsllw $15, %xmm0, %xmm0
-; X64-NEXT: vpmovw2m %xmm0, %k1
-; X64-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1}
-; X64-NEXT: retq
+; X64VL-LABEL: storeu8f16mask:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vpsllw $15, %xmm0, %xmm0
+; X64VL-NEXT: vpmovw2m %xmm0, %k1
+; X64VL-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1}
+; X64VL-NEXT: retq
;
; X86-LABEL: storeu8f16mask:
; X86: # %bb.0:
@@ -1118,17 +1291,28 @@ define void @storeu8f16mask(<8 x i1> %mask, ptr %addr, <8 x half> %val) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 %xmm1, (%eax) {%k1}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: storeu8f16mask:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; X64-NOVL-NEXT: vpsllw $15, %xmm0, %xmm0
+; X64-NOVL-NEXT: vpmovw2m %zmm0, %k0
+; X64-NOVL-NEXT: kshiftld $24, %k0, %k0
+; X64-NOVL-NEXT: kshiftrd $24, %k0, %k1
+; X64-NOVL-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
+; X64-NOVL-NEXT: vzeroupper
+; X64-NOVL-NEXT: retq
call void @llvm.masked.store.v8f16.p0(<8 x half> %val, ptr %addr, i32 4, <8 x i1>%mask)
ret void
}
define <8 x half> @maskloadu8f16(ptr %addr, <8 x half> %val, <8 x i1> %mask) {
-; X64-LABEL: maskloadu8f16:
-; X64: # %bb.0:
-; X64-NEXT: vpsllw $15, %xmm1, %xmm1
-; X64-NEXT: vpmovw2m %xmm1, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
-; X64-NEXT: retq
+; X64VL-LABEL: maskloadu8f16:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vpsllw $15, %xmm1, %xmm1
+; X64VL-NEXT: vpmovw2m %xmm1, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
+; X64VL-NEXT: retq
;
; X86-LABEL: maskloadu8f16:
; X86: # %bb.0:
@@ -1137,17 +1321,29 @@ define <8 x half> @maskloadu8f16(ptr %addr, <8 x half> %val, <8 x i1> %mask) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: maskloadu8f16:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; X64-NOVL-NEXT: vpsllw $15, %xmm1, %xmm1
+; X64-NOVL-NEXT: vpmovw2m %zmm1, %k0
+; X64-NOVL-NEXT: kshiftld $24, %k0, %k0
+; X64-NOVL-NEXT: kshiftrd $24, %k0, %k1
+; X64-NOVL-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT: vzeroupper
+; X64-NOVL-NEXT: retq
%res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> %val)
ret <8 x half> %res
}
define <8 x half> @maskuloadu8f16(ptr %addr, <8 x i1> %mask) {
-; X64-LABEL: maskuloadu8f16:
-; X64: # %bb.0:
-; X64-NEXT: vpsllw $15, %xmm0, %xmm0
-; X64-NEXT: vpmovw2m %xmm0, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
-; X64-NEXT: retq
+; X64VL-LABEL: maskuloadu8f16:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vpsllw $15, %xmm0, %xmm0
+; X64VL-NEXT: vpmovw2m %xmm0, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
+; X64VL-NEXT: retq
;
; X86-LABEL: maskuloadu8f16:
; X86: # %bb.0:
@@ -1156,17 +1352,28 @@ define <8 x half> @maskuloadu8f16(ptr %addr, <8 x i1> %mask) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: maskuloadu8f16:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vpsllw $15, %xmm0, %xmm0
+; X64-NOVL-NEXT: vpmovw2m %zmm0, %k0
+; X64-NOVL-NEXT: kshiftld $24, %k0, %k0
+; X64-NOVL-NEXT: kshiftrd $24, %k0, %k1
+; X64-NOVL-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT: vzeroupper
+; X64-NOVL-NEXT: retq
%res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> undef)
ret <8 x half> %res
}
define <8 x half> @maskzloadu8f16(ptr %addr, <8 x i1> %mask) {
-; X64-LABEL: maskzloadu8f16:
-; X64: # %bb.0:
-; X64-NEXT: vpsllw $15, %xmm0, %xmm0
-; X64-NEXT: vpmovw2m %xmm0, %k1
-; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
-; X64-NEXT: retq
+; X64VL-LABEL: maskzloadu8f16:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vpsllw $15, %xmm0, %xmm0
+; X64VL-NEXT: vpmovw2m %xmm0, %k1
+; X64VL-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
+; X64VL-NEXT: retq
;
; X86-LABEL: maskzloadu8f16:
; X86: # %bb.0:
@@ -1175,6 +1382,17 @@ define <8 x half> @maskzloadu8f16(ptr %addr, <8 x i1> %mask) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: maskzloadu8f16:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vpsllw $15, %xmm0, %xmm0
+; X64-NOVL-NEXT: vpmovw2m %zmm0, %k0
+; X64-NOVL-NEXT: kshiftld $24, %k0, %k0
+; X64-NOVL-NEXT: kshiftrd $24, %k0, %k1
+; X64-NOVL-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT: vzeroupper
+; X64-NOVL-NEXT: retq
%res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> zeroinitializer)
ret <8 x half> %res
}
@@ -1188,11 +1406,11 @@ define <8 x half> @movrr8f16(<8 x half> %a, <8 x half> %b) {
}
define <8 x half> @movrrk8f16(<8 x half> %a, <8 x half> %b, i8 %msk) {
-; X64-LABEL: movrrk8f16:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %edi, %k1
-; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
-; X64-NEXT: retq
+; X64VL-LABEL: movrrk8f16:
+; X64VL: # %bb.0:
+; X64VL-NEXT: kmovd %edi, %k1
+; X64VL-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; X64VL-NEXT: retq
;
; X86-LABEL: movrrk8f16:
; X86: # %bb.0:
@@ -1200,17 +1418,27 @@ define <8 x half> @movrrk8f16(<8 x half> %a, <8 x half> %b, i8 %msk) {
; X86-NEXT: kmovd %eax, %k1
; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: movrrk8f16:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; X64-NOVL-NEXT: kmovd %edi, %k1
+; X64-NOVL-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT: vzeroupper
+; X64-NOVL-NEXT: retq
%mask = bitcast i8 %msk to <8 x i1>
%res = select <8 x i1> %mask, <8 x half> %a, <8 x half> %b
ret <8 x half> %res
}
define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) {
-; X64-LABEL: movrrkz8f16:
-; X64: # %bb.0:
-; X64-NEXT: kmovd %edi, %k1
-; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
-; X64-NEXT: retq
+; X64VL-LABEL: movrrkz8f16:
+; X64VL: # %bb.0:
+; X64VL-NEXT: kmovd %edi, %k1
+; X64VL-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; X64VL-NEXT: retq
;
; X86-LABEL: movrrkz8f16:
; X86: # %bb.0:
@@ -1218,18 +1446,69 @@ define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) {
; X86-NEXT: kmovd %eax, %k1
; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: movrrkz8f16:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; X64-NOVL-NEXT: kmovd %edi, %k1
+; X64-NOVL-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X64-NOVL-NEXT: vzeroupper
+; X64-NOVL-NEXT: retq
%mask = bitcast i8 %msk to <8 x i1>
%res = select <8 x i1> %mask, <8 x half> %a, <8 x half> zeroinitializer
ret <8 x half> %res
}
define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) {
-; CHECK-LABEL: movsh:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
-; CHECK-NEXT: vmovsh %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vaddph %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: ret{{[l|q]}}
+; X64VL-LABEL: movsh:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
+; X64VL-NEXT: vmovsh %xmm0, %xmm1, %xmm0
+; X64VL-NEXT: vaddph %xmm0, %xmm2, %xmm0
+; X64VL-NEXT: retq
+;
+; X86-LABEL: movsh:
+; X86: # %bb.0:
+; X86-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
+; X86-NEXT: vmovsh %xmm0, %xmm1, %xmm0
+; X86-NEXT: vaddph %xmm0, %xmm2, %xmm0
+; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: movsh:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
+; X64-NOVL-NEXT: vmovsh %xmm0, %xmm1, %xmm3
+; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NOVL-NEXT: vaddsh %xmm4, %xmm5, %xmm4
+; X64-NOVL-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,3,3,3]
+; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[3,3,3,3]
+; X64-NOVL-NEXT: vaddsh %xmm5, %xmm6, %xmm5
+; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NOVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; X64-NOVL-NEXT: vaddsh %xmm5, %xmm0, %xmm0
+; X64-NOVL-NEXT: vshufpd {{.*#+}} xmm5 = xmm3[1,0]
+; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; X64-NOVL-NEXT: vaddsh %xmm5, %xmm6, %xmm5
+; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; X64-NOVL-NEXT: vpsrlq $48, %xmm1, %xmm4
+; X64-NOVL-NEXT: vpsrlq $48, %xmm2, %xmm5
+; X64-NOVL-NEXT: vaddsh %xmm4, %xmm5, %xmm4
+; X64-NOVL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; X64-NOVL-NEXT: vaddsh %xmm5, %xmm6, %xmm5
+; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; X64-NOVL-NEXT: vaddsh %xmm3, %xmm2, %xmm3
+; X64-NOVL-NEXT: vpsrld $16, %xmm1, %xmm1
+; X64-NOVL-NEXT: vpsrld $16, %xmm2, %xmm2
+; X64-NOVL-NEXT: vaddsh %xmm1, %xmm2, %xmm1
+; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; X64-NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NOVL-NEXT: retq
%res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
%res2 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%res = fadd <8 x half> %res1, %res2
@@ -1939,14 +2218,14 @@ define <8 x half> @regression1(<8 x half> %a, <8 x half> %b) {
}
define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2, <4 x float> %3, ptr %4) {
-; X64-LABEL: regression2:
-; X64: # %bb.0:
-; X64-NEXT: vmovw (%rsi), %xmm0
-; X64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
-; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
-; X64-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; X64-NEXT: retq
+; X64VL-LABEL: regression2:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vmovw (%rsi), %xmm0
+; X64VL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64VL-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X64VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; X64VL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; X64VL-NEXT: retq
;
; X86-LABEL: regression2:
; X86: # %bb.0:
@@ -1957,6 +2236,16 @@ define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2,
; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
; X86-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: regression2:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vmovw (%rsi), %xmm0
+; X64-NOVL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X64-NOVL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; X64-NOVL-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.92156886E-3,3.92156886E-3,3.92156886E-3,3.92156886E-3]
+; X64-NOVL-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; X64-NOVL-NEXT: retq
%6 = load i8, ptr %4, align 1
%7 = getelementptr i8, ptr %4, i64 1
%8 = addrspacecast ptr %7 to ptr addrspace(4)
@@ -1973,13 +2262,13 @@ define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2,
; Make sure load/stores of v4f16 are handled well on 32-bit targets where
; default widening legalization can't use i64.
define void @load_store_v4f16(ptr %x, ptr %y, ptr %z) {
-; X64-LABEL: load_store_v4f16:
-; X64: # %bb.0:
-; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: vaddph %xmm1, %xmm0, %xmm0
-; X64-NEXT: vmovlps %xmm0, (%rdx)
-; X64-NEXT: retq
+; X64VL-LABEL: load_store_v4f16:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64VL-NEXT: vaddph %xmm1, %xmm0, %xmm0
+; X64VL-NEXT: vmovlps %xmm0, (%rdx)
+; X64VL-NEXT: retq
;
; X86-LABEL: load_store_v4f16:
; X86: # %bb.0:
@@ -1991,6 +2280,26 @@ define void @load_store_v4f16(ptr %x, ptr %y, ptr %z) {
; X86-NEXT: vaddph %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlps %xmm0, (%eax)
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: load_store_v4f16:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X64-NOVL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-NOVL-NEXT: vpsrlq $48, %xmm1, %xmm2
+; X64-NOVL-NEXT: vpsrlq $48, %xmm0, %xmm3
+; X64-NOVL-NEXT: vaddsh %xmm2, %xmm3, %xmm2
+; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; X64-NOVL-NEXT: vaddsh %xmm3, %xmm4, %xmm3
+; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X64-NOVL-NEXT: vaddsh %xmm1, %xmm0, %xmm3
+; X64-NOVL-NEXT: vpsrld $16, %xmm1, %xmm1
+; X64-NOVL-NEXT: vpsrld $16, %xmm0, %xmm0
+; X64-NOVL-NEXT: vaddsh %xmm1, %xmm0, %xmm0
+; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NOVL-NEXT: vmovq %xmm0, (%rdx)
+; X64-NOVL-NEXT: retq
%a = load <4 x half>, ptr %x
%b = load <4 x half>, ptr %y
%c = fadd <4 x half> %a, %b
@@ -2044,20 +2353,20 @@ define <16 x i16> @test22(ptr %mem) nounwind {
}
define void @pr52560(i8 %0, <2 x i16> %1, ptr %c) nounwind {
-; X64-LABEL: pr52560:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movsbl %dil, %eax
-; X64-NEXT: vmovw %eax, %xmm1
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT: vpcmpgtw %xmm2, %xmm1, %k1
-; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
-; X64-NEXT: vmovw %xmm0, %eax
-; X64-NEXT: testw %ax, %ax
-; X64-NEXT: je .LBB123_2
-; X64-NEXT: # %bb.1: # %for.body.preheader
-; X64-NEXT: movb $0, (%rsi)
-; X64-NEXT: .LBB123_2: # %for.end
-; X64-NEXT: retq
+; X64VL-LABEL: pr52560:
+; X64VL: # %bb.0: # %entry
+; X64VL-NEXT: movsbl %dil, %eax
+; X64VL-NEXT: vmovw %eax, %xmm1
+; X64VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64VL-NEXT: vpcmpgtw %xmm2, %xmm1, %k1
+; X64VL-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; X64VL-NEXT: vmovw %xmm0, %eax
+; X64VL-NEXT: testw %ax, %ax
+; X64VL-NEXT: je .LBB123_2
+; X64VL-NEXT: # %bb.1: # %for.body.preheader
+; X64VL-NEXT: movb $0, (%rsi)
+; X64VL-NEXT: .LBB123_2: # %for.end
+; X64VL-NEXT: retq
;
; X86-LABEL: pr52560:
; X86: # %bb.0: # %entry
@@ -2074,6 +2383,23 @@ define void @pr52560(i8 %0, <2 x i16> %1, ptr %c) nounwind {
; X86-NEXT: movb $0, (%eax)
; X86-NEXT: .LBB123_2: # %for.end
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: pr52560:
+; X64-NOVL: # %bb.0: # %entry
+; X64-NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; X64-NOVL-NEXT: movsbl %dil, %eax
+; X64-NOVL-NEXT: vmovw %eax, %xmm1
+; X64-NOVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-NOVL-NEXT: vpcmpgtw %zmm2, %zmm1, %k1
+; X64-NOVL-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X64-NOVL-NEXT: vmovw %xmm0, %eax
+; X64-NOVL-NEXT: testw %ax, %ax
+; X64-NOVL-NEXT: je .LBB123_2
+; X64-NOVL-NEXT: # %bb.1: # %for.body.preheader
+; X64-NOVL-NEXT: movb $0, (%rsi)
+; X64-NOVL-NEXT: .LBB123_2: # %for.end
+; X64-NOVL-NEXT: vzeroupper
+; X64-NOVL-NEXT: retq
entry:
%conv = sext i8 %0 to i16
%2 = insertelement <2 x i16> <i16 poison, i16 0>, i16 %conv, i32 0
@@ -2092,17 +2418,17 @@ for.end: ; preds = %for.body.preheader,
}
define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width"="256" "prefer-vector-width"="256" nounwind {
-; X64-LABEL: pr52561:
-; X64: # %bb.0:
-; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; X64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
-; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT: vmovsh %xmm0, %xmm2, %xmm0
-; X64-NEXT: retq
+; X64VL-LABEL: pr52561:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vpaddd %ymm3, %ymm1, %ymm1
+; X64VL-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; X64VL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
+; X64VL-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; X64VL-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; X64VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64VL-NEXT: vmovsh %xmm0, %xmm2, %xmm0
+; X64VL-NEXT: retq
;
; X86-LABEL: pr52561:
; X86: # %bb.0:
@@ -2121,6 +2447,13 @@ define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: pr52561:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; X64-NOVL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; X64-NOVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; X64-NOVL-NEXT: retq
%1 = add <16 x i32> %a, <i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112>
%2 = add <16 x i32> %1, %b
%3 = and <16 x i32> %2, <i32 65535, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 65535>
@@ -2128,13 +2461,13 @@ define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width
}
define <8 x i16> @pr59628_xmm(i16 %arg) {
-; X64-LABEL: pr59628_xmm:
-; X64: # %bb.0:
-; X64-NEXT: vmovw %edi, %xmm0
-; X64-NEXT: vpbroadcastw %edi, %xmm1
-; X64-NEXT: vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
-; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
-; X64-NEXT: retq
+; X64VL-LABEL: pr59628_xmm:
+; X64VL: # %bb.0:
+; X64VL-NEXT: vmovw %edi, %xmm0
+; X64VL-NEXT: vpbroadcastw %edi, %xmm1
+; X64VL-NEXT: vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
+; X64VL-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; X64VL-NEXT: retq
;
; X86-LABEL: pr59628_xmm:
; X86: # %bb.0:
@@ -2145,6 +2478,13 @@ define <8 x i16> @pr59628_xmm(i16 %arg) {
; X86-NEXT: vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %k1
; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
+;
+; X64-NOVL-LABEL: pr59628_xmm:
+; X64-NOVL: # %bb.0:
+; X64-NOVL-NEXT: vmovw %edi, %xmm0
+; X64-NOVL-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; X64-NOVL-NEXT: vpandn %xmm0, %xmm1, %xmm0
+; X64-NOVL-NEXT: retq
%I1 = insertelement <8 x i16> zeroinitializer, i16 %arg, i16 0
%I2 = insertelement <8 x i16> %I1, i16 0, i16 %arg
ret <8 x i16> %I2