[llvm] [X86] Shrink width of masked loads/stores (PR #105451)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 21 11:40:22 PDT 2024
https://github.com/goldsteinn updated https://github.com/llvm/llvm-project/pull/105451
>From f88acdee36976d161c3021e818a428cbfd0fb6f8 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Tue, 20 Aug 2024 14:49:43 -0700
Subject: [PATCH 1/2] [X86] Add tests for shrinking width of masked
loads/stores; NFC
---
.../CodeGen/X86/masked-load-store-shrink.ll | 803 ++++++++++++++++++
1 file changed, 803 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/masked-load-store-shrink.ll
diff --git a/llvm/test/CodeGen/X86/masked-load-store-shrink.ll b/llvm/test/CodeGen/X86/masked-load-store-shrink.ll
new file mode 100644
index 00000000000000..a3bbd79b2d326f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/masked-load-store-shrink.ll
@@ -0,0 +1,803 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx512f,avx512bw,avx512dq,avx512vl -verify-machineinstrs | FileCheck %s --check-prefixes=X86-AVX512
+
+define <4 x i64> @mload256_to_load128(ptr %p) nounwind {
+; SSE-LABEL: mload256_to_load128:
+; SSE: ## %bb.0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: mload256_to_load128:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,4294967295,4294967295,0,0,0,0]
+; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512F-LABEL: mload256_to_load128:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: movw $15, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: mload256_to_load128:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movb $15, %al
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: mload256_to_load128:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movb $15, %al
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload256_to_load128:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $15, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovaps (%eax), %ymm0 {%k1} {z}
+; X86-AVX512-NEXT: retl
+ %tmp = tail call <8 x float> @llvm.masked.load.v8f32.p0(ptr %p, i32 32, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x float> <float poison, float poison, float poison, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>)
+ %r = bitcast <8 x float> %tmp to <4 x i64>
+ ret <4 x i64> %r
+}
+
+define <8 x i64> @mload512_to_load256(ptr %p) nounwind {
+; SSE-LABEL: mload512_to_load256:
+; SSE: ## %bb.0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: mload512_to_load256:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vmovups (%rdi), %ymm0
+; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512F-LABEL: mload512_to_load256:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: mload512_to_load256:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: vmovups (%rdi), %ymm0
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: mload512_to_load256:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movl $65535, %eax ## imm = 0xFFFF
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload512_to_load256:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl $65535, %ecx ## imm = 0xFFFF
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
+; X86-AVX512-NEXT: retl
+ %tmp = tail call <32 x i16> @llvm.masked.load.v32i16.p0(ptr %p, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <32 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+ %r = bitcast <32 x i16> %tmp to <8 x i64>
+ ret <8 x i64> %r
+}
+
+define <8 x i64> @mload512_to_mload128(ptr %p) nounwind {
+; SSE-LABEL: mload512_to_mload128:
+; SSE: ## %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: mload512_to_mload128:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512F-LABEL: mload512_to_mload128:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: movw $3, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: mload512_to_mload128:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movw $3, %ax
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: mload512_to_mload128:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movw $3, %ax
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload512_to_mload128:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movw $3, %cx
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovaps (%eax), %zmm0 {%k1} {z}
+; X86-AVX512-NEXT: retl
+ %tmp = tail call <16 x float> @llvm.masked.load.v16f32.p0(ptr %p, i32 64, <16 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>)
+ %r = bitcast <16 x float> %tmp to <8 x i64>
+ ret <8 x i64> %r
+}
+
+define <4 x i64> @mload256_to_mload128(ptr %p) nounwind {
+; SSE2-LABEL: mload256_to_mload128:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: mload256_to_mload128:
+; SSE42: ## %bb.0:
+; SSE42-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,mem[0],zero
+; SSE42-NEXT: xorps %xmm1, %xmm1
+; SSE42-NEXT: retq
+;
+; AVX1OR2-LABEL: mload256_to_mload128:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,0,4294967295,0,0,0,0,0]
+; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512F-LABEL: mload256_to_mload128:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: movw $5, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: mload256_to_mload128:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movb $5, %al
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: mload256_to_mload128:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movb $5, %al
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload256_to_mload128:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $5, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovaps (%eax), %ymm0 {%k1} {z}
+; X86-AVX512-NEXT: retl
+ %tmp = tail call <8 x float> @llvm.masked.load.v8f32.p0(ptr %p, i32 32, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x float> <float poison, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>)
+ %r = bitcast <8 x float> %tmp to <4 x i64>
+ ret <4 x i64> %r
+}
+
+define <8 x i64> @mload512_to_mload256(ptr %p) nounwind {
+; SSE-LABEL: mload512_to_mload256:
+; SSE: ## %bb.0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: mload512_to_mload256:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,4294967295,4294967295,4294967295,0,0,0]
+; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512F-LABEL: mload512_to_mload256:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: movw $28, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: mload512_to_mload256:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movw $28, %ax
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: mload512_to_mload256:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movw $28, %ax
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload512_to_mload256:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movw $28, %cx
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovaps (%eax), %zmm0 {%k1} {z}
+; X86-AVX512-NEXT: retl
+ %tmp = tail call <16 x float> @llvm.masked.load.v16f32.p0(ptr %p, i32 64, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> <float 0.000000e+00, float 0.000000e+00, float poison, float poison, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>)
+ %r = bitcast <16 x float> %tmp to <8 x i64>
+ ret <8 x i64> %r
+}
+
+define <8 x i64> @mload512_fail_no_possible_shrink(ptr %p) nounwind {
+; SSE-LABEL: mload512_fail_no_possible_shrink:
+; SSE: ## %bb.0:
+; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: mload512_fail_no_possible_shrink:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0]
+; AVX1OR2-NEXT: vmaskmovps 32(%rdi), %ymm0, %ymm1
+; AVX1OR2-NEXT: vmovaps (%rdi), %ymm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512F-LABEL: mload512_fail_no_possible_shrink:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: movw $511, %ax ## imm = 0x1FF
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: mload512_fail_no_possible_shrink:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movw $511, %ax ## imm = 0x1FF
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: mload512_fail_no_possible_shrink:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movw $511, %ax ## imm = 0x1FF
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload512_fail_no_possible_shrink:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movw $511, %cx ## imm = 0x1FF
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovaps (%eax), %zmm0 {%k1} {z}
+; X86-AVX512-NEXT: retl
+ %tmp = tail call <16 x float> @llvm.masked.load.v16f32.p0(ptr %p, i32 64, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> <float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>)
+ %r = bitcast <16 x float> %tmp to <8 x i64>
+ ret <8 x i64> %r
+}
+
+define <8 x i64> @mload512_fail_non_zero_passthru(ptr %p, <8 x i64> %v) nounwind {
+; SSE-LABEL: mload512_fail_non_zero_passthru:
+; SSE: ## %bb.0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: mload512_fail_non_zero_passthru:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vmovaps (%rdi), %ymm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512F-LABEL: mload512_fail_non_zero_passthru:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: movw $255, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovaps (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: mload512_fail_non_zero_passthru:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movw $255, %ax
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vmovaps (%rdi), %zmm0 {%k1}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: mload512_fail_non_zero_passthru:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movw $255, %ax
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovaps (%rdi), %zmm0 {%k1}
+; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload512_fail_non_zero_passthru:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movw $255, %cx
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovaps (%eax), %zmm0 {%k1}
+; X86-AVX512-NEXT: retl
+ %tmp = bitcast <8 x i64> %v to <16 x float>
+ %r = tail call <16 x float> @llvm.masked.load.v16f32.p0(ptr %p, i32 64, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> %tmp)
+ %2 = bitcast <16 x float> %r to <8 x i64>
+ ret <8 x i64> %2
+}
+
+define <2 x i64> @mload128_fail_no_possible_shrink(ptr %p) nounwind {
+; SSE-LABEL: mload128_fail_no_possible_shrink:
+; SSE: ## %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: mload128_fail_no_possible_shrink:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,4294967295,0,0]
+; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512F-LABEL: mload128_fail_no_possible_shrink:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: movw $3, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: mload128_fail_no_possible_shrink:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movb $3, %al
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0 {%k1} {z}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: mload128_fail_no_possible_shrink:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movb $3, %al
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovaps (%rdi), %xmm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload128_fail_no_possible_shrink:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $3, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovaps (%eax), %xmm0 {%k1} {z}
+; X86-AVX512-NEXT: retl
+ %tmp = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 16, <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00>)
+ %r = bitcast <4 x float> %tmp to <2 x i64>
+ ret <2 x i64> %r
+}
+
+define void @mstore256_to_store128(ptr %p, <4 x i64> %v) nounwind {
+; SSE2-LABEL: mstore256_to_store128:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: movd %xmm0, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: movd %xmm1, 4(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT: movd %xmm1, 8(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT: movd %xmm0, 12(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: mstore256_to_store128:
+; SSE42: ## %bb.0:
+; SSE42-NEXT: movups %xmm0, (%rdi)
+; SSE42-NEXT: retq
+;
+; AVX1OR2-LABEL: mstore256_to_store128:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295,0,0,0,0]
+; AVX1OR2-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi)
+; AVX1OR2-NEXT: vzeroupper
+; AVX1OR2-NEXT: retq
+;
+; AVX512F-LABEL: mstore256_to_store128:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: movw $15, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: mstore256_to_store128:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movb $15, %al
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vmovaps %ymm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT: vzeroupper
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: mstore256_to_store128:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movb $15, %al
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovaps %ymm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT: vzeroupper
+; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mstore256_to_store128:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $15, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovaps %ymm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
+ %tmp = bitcast <4 x i64> %v to <8 x float>
+ tail call void @llvm.masked.store.v8f32.p0(<8 x float> %tmp, ptr %p, i32 32, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>)
+ ret void
+}
+
+define void @mstore512_to_store256(ptr %p, <8 x i64> %v) nounwind {
+; SSE2-LABEL: mstore512_to_store256:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movw %ax, (%rdi)
+; SSE2-NEXT: pextrw $1, %xmm0, %eax
+; SSE2-NEXT: movw %ax, 2(%rdi)
+; SSE2-NEXT: pextrw $2, %xmm0, %eax
+; SSE2-NEXT: movw %ax, 4(%rdi)
+; SSE2-NEXT: pextrw $3, %xmm0, %eax
+; SSE2-NEXT: movw %ax, 6(%rdi)
+; SSE2-NEXT: pextrw $4, %xmm0, %eax
+; SSE2-NEXT: movw %ax, 8(%rdi)
+; SSE2-NEXT: pextrw $5, %xmm0, %eax
+; SSE2-NEXT: movw %ax, 10(%rdi)
+; SSE2-NEXT: pextrw $6, %xmm0, %eax
+; SSE2-NEXT: movw %ax, 12(%rdi)
+; SSE2-NEXT: pextrw $7, %xmm0, %eax
+; SSE2-NEXT: movw %ax, 14(%rdi)
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movw %ax, 16(%rdi)
+; SSE2-NEXT: pextrw $1, %xmm1, %eax
+; SSE2-NEXT: movw %ax, 18(%rdi)
+; SSE2-NEXT: pextrw $2, %xmm1, %eax
+; SSE2-NEXT: movw %ax, 20(%rdi)
+; SSE2-NEXT: pextrw $3, %xmm1, %eax
+; SSE2-NEXT: movw %ax, 22(%rdi)
+; SSE2-NEXT: pextrw $4, %xmm1, %eax
+; SSE2-NEXT: movw %ax, 24(%rdi)
+; SSE2-NEXT: pextrw $5, %xmm1, %eax
+; SSE2-NEXT: movw %ax, 26(%rdi)
+; SSE2-NEXT: pextrw $6, %xmm1, %eax
+; SSE2-NEXT: movw %ax, 28(%rdi)
+; SSE2-NEXT: pextrw $7, %xmm1, %eax
+; SSE2-NEXT: movw %ax, 30(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: mstore512_to_store256:
+; SSE42: ## %bb.0:
+; SSE42-NEXT: movups %xmm0, (%rdi)
+; SSE42-NEXT: movups %xmm1, 16(%rdi)
+; SSE42-NEXT: retq
+;
+; AVX1OR2-LABEL: mstore512_to_store256:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vmovups %ymm0, (%rdi)
+; AVX1OR2-NEXT: vzeroupper
+; AVX1OR2-NEXT: retq
+;
+; AVX512F-LABEL: mstore512_to_store256:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: vmovups %ymm0, (%rdi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: mstore512_to_store256:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: vmovups %ymm0, (%rdi)
+; AVX512VLDQ-NEXT: vzeroupper
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: mstore512_to_store256:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movl $65535, %eax ## imm = 0xFFFF
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT: vzeroupper
+; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mstore512_to_store256:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl $65535, %ecx ## imm = 0xFFFF
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovdqu16 %zmm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
+ %tmp = bitcast <8 x i64> %v to <32 x i16>
+ tail call void @llvm.masked.store.v32i16.p0(<32 x i16> %tmp, ptr %p, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+ ret void
+}
+
+define void @mstore512_to_mstore128(ptr %p, <8 x i64> %v) nounwind {
+; SSE2-LABEL: mstore512_to_mstore128:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: movd %xmm0, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE2-NEXT: movd %xmm0, 4(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: mstore512_to_mstore128:
+; SSE42: ## %bb.0:
+; SSE42-NEXT: movss %xmm0, (%rdi)
+; SSE42-NEXT: extractps $1, %xmm0, 4(%rdi)
+; SSE42-NEXT: retq
+;
+; AVX1OR2-LABEL: mstore512_to_mstore128:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,0,0,0,0,0,0]
+; AVX1OR2-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi)
+; AVX1OR2-NEXT: vzeroupper
+; AVX1OR2-NEXT: retq
+;
+; AVX512F-LABEL: mstore512_to_mstore128:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: movw $3, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: mstore512_to_mstore128:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movw $3, %ax
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT: vzeroupper
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: mstore512_to_mstore128:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movw $3, %ax
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT: vzeroupper
+; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mstore512_to_mstore128:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movw $3, %cx
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovaps %zmm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
+ %tmp = bitcast <8 x i64> %v to <16 x float>
+ tail call void @llvm.masked.store.v16f32.p0(<16 x float> %tmp, ptr %p, i32 64, <16 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+ ret void
+}
+
+define void @mstore256_to_mstore128(ptr %p, <4 x i64> %v) nounwind {
+; SSE2-LABEL: mstore256_to_mstore128:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: movd %xmm0, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT: movd %xmm0, 8(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: mstore256_to_mstore128:
+; SSE42: ## %bb.0:
+; SSE42-NEXT: movss %xmm0, (%rdi)
+; SSE42-NEXT: extractps $2, %xmm0, 8(%rdi)
+; SSE42-NEXT: retq
+;
+; AVX1OR2-LABEL: mstore256_to_mstore128:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,0,4294967295,0,0,0,0,0]
+; AVX1OR2-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi)
+; AVX1OR2-NEXT: vzeroupper
+; AVX1OR2-NEXT: retq
+;
+; AVX512F-LABEL: mstore256_to_mstore128:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: movw $5, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: mstore256_to_mstore128:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movb $5, %al
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vmovaps %ymm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT: vzeroupper
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: mstore256_to_mstore128:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movb $5, %al
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovaps %ymm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT: vzeroupper
+; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mstore256_to_mstore128:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $5, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovaps %ymm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
+ %tmp = bitcast <4 x i64> %v to <8 x float>
+ tail call void @llvm.masked.store.v8f32.p0(<8 x float> %tmp, ptr %p, i32 32, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>)
+ ret void
+}
+
+define void @mstore512_to_mstore256(ptr %p, <8 x i64> %v) nounwind {
+; SSE2-LABEL: mstore512_to_mstore256:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT: movd %xmm2, 8(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT: movd %xmm0, 12(%rdi)
+; SSE2-NEXT: movss %xmm1, 16(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: mstore512_to_mstore256:
+; SSE42: ## %bb.0:
+; SSE42-NEXT: extractps $2, %xmm0, 8(%rdi)
+; SSE42-NEXT: extractps $3, %xmm0, 12(%rdi)
+; SSE42-NEXT: movss %xmm1, 16(%rdi)
+; SSE42-NEXT: retq
+;
+; AVX1OR2-LABEL: mstore512_to_mstore256:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,4294967295,4294967295,4294967295,0,0,0]
+; AVX1OR2-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi)
+; AVX1OR2-NEXT: vzeroupper
+; AVX1OR2-NEXT: retq
+;
+; AVX512F-LABEL: mstore512_to_mstore256:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: movw $28, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: mstore512_to_mstore256:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movw $28, %ax
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT: vzeroupper
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: mstore512_to_mstore256:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movw $28, %ax
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT: vzeroupper
+; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mstore512_to_mstore256:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movw $28, %cx
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovaps %zmm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
+ %tmp = bitcast <8 x i64> %v to <16 x float>
+ tail call void @llvm.masked.store.v16f32.p0(<16 x float> %tmp, ptr %p, i32 64, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+ ret void
+}
+
+define void @mstore256_fail_no_possible_shrink(ptr %p, <4 x i64> %v) nounwind {
+; SSE2-LABEL: mstore256_fail_no_possible_shrink:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: movd %xmm0, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE2-NEXT: movd %xmm2, 4(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT: movd %xmm2, 8(%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT: movd %xmm0, 12(%rdi)
+; SSE2-NEXT: movss %xmm1, 16(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: mstore256_fail_no_possible_shrink:
+; SSE42: ## %bb.0:
+; SSE42-NEXT: movups %xmm0, (%rdi)
+; SSE42-NEXT: movss %xmm1, 16(%rdi)
+; SSE42-NEXT: retq
+;
+; AVX1OR2-LABEL: mstore256_fail_no_possible_shrink:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295,4294967295,0,0,0]
+; AVX1OR2-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi)
+; AVX1OR2-NEXT: vzeroupper
+; AVX1OR2-NEXT: retq
+;
+; AVX512F-LABEL: mstore256_fail_no_possible_shrink:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: movw $31, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: mstore256_fail_no_possible_shrink:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movb $31, %al
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vmovaps %ymm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT: vzeroupper
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: mstore256_fail_no_possible_shrink:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movb $31, %al
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovaps %ymm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT: vzeroupper
+; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mstore256_fail_no_possible_shrink:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $31, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovaps %ymm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
+ %tmp = bitcast <4 x i64> %v to <8 x float>
+ tail call void @llvm.masked.store.v8f32.p0(<8 x float> %tmp, ptr %p, i32 32, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false>)
+ ret void
+}
+
+declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32 immarg, <8 x i1>, <8 x float>)
+
+declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32 immarg, <32 x i1>, <32 x i16>)
+
+declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32 immarg, <16 x i1>, <16 x float>)
+
+declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)
+
+declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32 immarg, <8 x i1>)
+
+declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32 immarg, <32 x i1>)
+
+declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32 immarg, <16 x i1>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX: {{.*}}
+; AVX1: {{.*}}
+; AVX2: {{.*}}
+; AVX512: {{.*}}
+; AVX512VL: {{.*}}
>From 03cedf6315570e1d403380cddd688deb03198448 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Tue, 20 Aug 2024 14:49:46 -0700
Subject: [PATCH 2/2] [X86] Shrink width of masked loads/stores
In the best case we can convert a masked load/store to a narrower
normal load/store, e.g. `_mm512_maskz_load_ps(p, 0xff)` can be done
with just a normal `ymm` load. Likewise, if all of the set mask bits
fall within a lower sub-vector, we can shrink the masked load/store,
e.g. `_mm512_maskz_load_ps(p, 0x1c)` gives the same result as
`_mm256_maskz_load_ps(p, 0x1c)` zero-extended to 512 bits.
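For illustration only (not part of this patch), a minimal C sketch of the
two cases, using the standard Intel intrinsic signatures (mask operand
first, unlike the shorthand above). The helper names are made up, `p` is
assumed to point to 64 bytes of 64-byte-aligned data, and it needs
-mavx512f -mavx512vl to build:

  #include <immintrin.h>

  __m512 load_low_half(const float *p) {
    // Mask 0x00ff covers exactly the low 256 bits, so with this combine the
    // backend should be able to emit a plain 256-bit load (upper half zero).
    return _mm512_maskz_load_ps((__mmask16)0x00ff, p);
  }

  // Hand-written equivalent of the shrunken form of load_low_half.
  __m512 load_low_half_shrunk(const float *p) {
    return _mm512_zextps256_ps512(_mm256_load_ps(p));
  }

  __m512 load_low_subvector(const float *p) {
    // Mask 0x001c only enables elements 2..4, all within the low 256 bits,
    // so the 512-bit masked load can shrink to a 256-bit masked load.
    return _mm512_maskz_load_ps((__mmask16)0x001c, p);
  }

  // Hand-written equivalent of the shrunken form of load_low_subvector.
  __m512 load_low_subvector_shrunk(const float *p) {
    return _mm512_zextps256_ps512(_mm256_maskz_load_ps((__mmask8)0x1c, p));
  }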
---
llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 5 +
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 18 ++
llvm/lib/Target/X86/X86ISelLowering.cpp | 107 ++++++++-
.../CodeGen/X86/masked-load-store-shrink.ll | 219 ++++++------------
llvm/test/CodeGen/X86/masked_load.ll | 12 +-
.../CodeGen/X86/masked_loadstore_split.ll | 6 +-
llvm/test/CodeGen/X86/masked_store.ll | 8 +-
llvm/test/CodeGen/X86/pr46532.ll | 2 +-
8 files changed, 208 insertions(+), 169 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 88549d9c9a2858..b2151cd32f1088 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1848,6 +1848,11 @@ bool isOneOrOneSplat(SDValue V, bool AllowUndefs = false);
/// Does not permit build vector implicit truncation.
bool isAllOnesOrAllOnesSplat(SDValue V, bool AllowUndefs = false);
+/// Returns the demanded elements from the mask of a masked op (e.g.
+/// MSTORE/MLOAD).
+APInt getDemandedEltsForMaskedOp(SDValue Mask, unsigned NumElts,
+ SmallVector<SDValue> *MaskEltsOut = nullptr);
+
/// Return true if \p V is either a integer or FP constant.
inline bool isIntOrFPConstant(SDValue V) {
return isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 18a3b7bce104a7..cb0e098a1e511e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -12128,6 +12128,24 @@ bool llvm::isAllOnesOrAllOnesSplat(SDValue N, bool AllowUndefs) {
return C && C->isAllOnes() && C->getValueSizeInBits(0) == BitWidth;
}
+APInt llvm::getDemandedEltsForMaskedOp(SDValue Mask, unsigned NumElts,
+ SmallVector<SDValue> *MaskEltsOut) {
+ if (!ISD::isBuildVectorOfConstantSDNodes(Mask.getNode()))
+ return APInt::getAllOnes(NumElts);
+ APInt Demanded = APInt::getZero(NumElts);
+ BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(Mask);
+ for (unsigned i = 0; i < MaskBV->getNumOperands(); ++i) {
+ APInt V;
+ if (!sd_match(MaskBV->getOperand(i), m_ConstInt(V)))
+ return APInt::getAllOnes(NumElts);
+ if (V.isNegative())
+ Demanded.setBit(i);
+ if (MaskEltsOut)
+ MaskEltsOut->emplace_back(MaskBV->getOperand(i));
+ }
+ return Demanded;
+}
+
HandleSDNode::~HandleSDNode() {
DropOperands();
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 169c955f0ba89f..b576127702ae23 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -51536,20 +51536,112 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}
+static bool tryShrinkMaskedOperation(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Mask, EVT OrigVT,
+ SDValue *ValInOut, EVT *NewVTOut,
+ SDValue *NewMaskOut) {
+ // Ensure we have a reasonable input type.
+ // Also ensure the input is wider than xmm; otherwise it's not
+ // profitable to try to shrink.
+ if (!OrigVT.isSimple() ||
+ !(OrigVT.is256BitVector() || OrigVT.is512BitVector()))
+ return false;
+
+ SmallVector<SDValue> OrigMask;
+ APInt DemandedElts = getDemandedEltsForMaskedOp(
+ Mask, OrigVT.getVectorNumElements(), &OrigMask);
+ if (DemandedElts.isAllOnes() || DemandedElts.isZero())
+ return false;
+
+ unsigned OrigNumElts = OrigVT.getVectorNumElements();
+ // Potential TODO: It might be profitable to extract a sub-vector other
+ // than just the "lower" one.
+ unsigned ReqElts =
+ DemandedElts.getBitWidth() - DemandedElts.countLeadingZeros();
+ // We can't shrink our vector category in a meaningful way.
+ if (ReqElts > OrigNumElts / 2U)
+ return false;
+
+ // At most shrink to xmm.
+ unsigned NewNumElts =
+ std::max(128U / OrigVT.getScalarSizeInBits(), PowerOf2Ceil(ReqElts));
+
+ EVT NewVT =
+ EVT::getVectorVT(*DAG.getContext(), OrigVT.getScalarType(), NewNumElts);
+ if (!NewVT.isSimple())
+ return false;
+
+ // Extract the value argument (if any) to the narrower type.
+ if (ValInOut && *ValInOut)
+ *ValInOut = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewVT, *ValInOut,
+ DAG.getIntPtrConstant(0, DL));
+ if (NewVTOut)
+ *NewVTOut = NewVT;
+ *NewMaskOut = SDValue();
+ // The mask was just truncating, so we don't need it anymore.
+ if (NewNumElts == ReqElts && DemandedElts.isMask())
+ return true;
+
+ // Get smaller mask.
+ EVT NewMaskVT = EVT::getVectorVT(
+ *DAG.getContext(), Mask.getValueType().getScalarType(), NewNumElts);
+ OrigMask.truncate(NewNumElts);
+ *NewMaskOut = DAG.getBuildVector(NewMaskVT, DL, OrigMask);
+ return true;
+}
+
+static bool tryShrinkMaskedOperation(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Mask, EVT OrigVT, EVT *NewVTOut,
+ SDValue *NewMaskOut) {
+ return tryShrinkMaskedOperation(DAG, DL, Mask, OrigVT, nullptr, NewVTOut,
+ NewMaskOut);
+}
+
+static bool tryShrinkMaskedOperation(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Mask, EVT OrigVT,
+ SDValue *ValInOut, SDValue *NewMaskOut) {
+ return tryShrinkMaskedOperation(DAG, DL, Mask, OrigVT, ValInOut, nullptr,
+ NewMaskOut);
+}
+
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ using namespace llvm::SDPatternMatch;
auto *Mld = cast<MaskedLoadSDNode>(N);
+ SDLoc DL(N);
// TODO: Expanding load with constant mask may be optimized as well.
if (Mld->isExpandingLoad())
return SDValue();
+ SDValue Mask = Mld->getMask();
+ EVT VT = Mld->getValueType(0);
if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
if (SDValue ScalarLoad =
reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
return ScalarLoad;
+ SDValue NewMask;
+ EVT NewVT;
+ if (sd_match(Mld->getPassThru(), m_Zero()) &&
+ tryShrinkMaskedOperation(DAG, DL, Mask, VT, &NewVT, &NewMask)) {
+ SDValue NewLoad;
+ if (NewMask)
+ NewLoad = DAG.getMaskedLoad(
+ NewVT, DL, Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
+ NewMask, getZeroVector(NewVT.getSimpleVT(), Subtarget, DAG, DL),
+ Mld->getMemoryVT(), Mld->getMemOperand(), Mld->getAddressingMode(),
+ Mld->getExtensionType());
+ else
+ NewLoad = DAG.getLoad(NewVT, DL, Mld->getChain(), Mld->getBasePtr(),
+ Mld->getMemOperand());
+
+ SDValue R = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Mld->getPassThru(),
+ NewLoad, DAG.getIntPtrConstant(0, DL));
+ return DCI.CombineTo(Mld, R, NewLoad.getValue(1), true);
+ }
+
// TODO: Do some AVX512 subsets benefit from this transform?
if (!Subtarget.hasAVX512())
if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
@@ -51558,9 +51650,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
// If the mask value has been legalized to a non-boolean vector, try to
// simplify ops leading up to it. We only demand the MSB of each lane.
- SDValue Mask = Mld->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
- EVT VT = Mld->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
@@ -51622,6 +51712,8 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
if (Mst->isCompressingStore())
return SDValue();
+
+
EVT VT = Mst->getValue().getValueType();
SDLoc dl(Mst);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -51651,6 +51743,17 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
}
SDValue Value = Mst->getValue();
+ SDValue NewMask;
+ if (tryShrinkMaskedOperation(DAG, dl, Mask, VT, &Value, &NewMask)) {
+ if (NewMask)
+ return DAG.getMaskedStore(Mst->getChain(), dl, Value, Mst->getBasePtr(),
+ Mst->getOffset(), NewMask, Mst->getMemoryVT(),
+ Mst->getMemOperand(), Mst->getAddressingMode());
+ return DAG.getStore(Mst->getChain(), SDLoc(N), Value, Mst->getBasePtr(),
+ Mst->getPointerInfo(), Mst->getOriginalAlign(),
+ Mst->getMemOperand()->getFlags());
+ }
+
if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
Mst->getMemoryVT())) {
diff --git a/llvm/test/CodeGen/X86/masked-load-store-shrink.ll b/llvm/test/CodeGen/X86/masked-load-store-shrink.ll
index a3bbd79b2d326f..c1cdef54bc3737 100644
--- a/llvm/test/CodeGen/X86/masked-load-store-shrink.ll
+++ b/llvm/test/CodeGen/X86/masked-load-store-shrink.ll
@@ -15,40 +15,15 @@ define <4 x i64> @mload256_to_load128(ptr %p) nounwind {
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: retq
;
-; AVX1OR2-LABEL: mload256_to_load128:
-; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,4294967295,4294967295,0,0,0,0]
-; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
-; AVX1OR2-NEXT: retq
-;
-; AVX512F-LABEL: mload256_to_load128:
-; AVX512F: ## %bb.0:
-; AVX512F-NEXT: movw $15, %ax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VLDQ-LABEL: mload256_to_load128:
-; AVX512VLDQ: ## %bb.0:
-; AVX512VLDQ-NEXT: movb $15, %al
-; AVX512VLDQ-NEXT: kmovw %eax, %k1
-; AVX512VLDQ-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
-; AVX512VLDQ-NEXT: retq
-;
-; AVX512VLBW-LABEL: mload256_to_load128:
-; AVX512VLBW: ## %bb.0:
-; AVX512VLBW-NEXT: movb $15, %al
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
-; AVX512VLBW-NEXT: retq
+; AVX-LABEL: mload256_to_load128:
+; AVX: ## %bb.0:
+; AVX-NEXT: vmovaps (%rdi), %xmm0
+; AVX-NEXT: retq
;
; X86-AVX512-LABEL: mload256_to_load128:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movb $15, %cl
-; X86-AVX512-NEXT: kmovd %ecx, %k1
-; X86-AVX512-NEXT: vmovaps (%eax), %ymm0 {%k1} {z}
+; X86-AVX512-NEXT: vmovaps (%eax), %xmm0
; X86-AVX512-NEXT: retl
%tmp = tail call <8 x float> @llvm.masked.load.v8f32.p0(ptr %p, i32 32, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x float> <float poison, float poison, float poison, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>)
%r = bitcast <8 x float> %tmp to <4 x i64>
@@ -70,29 +45,15 @@ define <8 x i64> @mload512_to_load256(ptr %p) nounwind {
; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT: retq
;
-; AVX512F-LABEL: mload512_to_load256:
-; AVX512F: ## %bb.0:
-; AVX512F-NEXT: vmovups (%rdi), %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VLDQ-LABEL: mload512_to_load256:
-; AVX512VLDQ: ## %bb.0:
-; AVX512VLDQ-NEXT: vmovups (%rdi), %ymm0
-; AVX512VLDQ-NEXT: retq
-;
-; AVX512VLBW-LABEL: mload512_to_load256:
-; AVX512VLBW: ## %bb.0:
-; AVX512VLBW-NEXT: movl $65535, %eax ## imm = 0xFFFF
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
-; AVX512VLBW-NEXT: retq
+; AVX512-LABEL: mload512_to_load256:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vmovups (%rdi), %ymm0
+; AVX512-NEXT: retq
;
; X86-AVX512-LABEL: mload512_to_load256:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movl $65535, %ecx ## imm = 0xFFFF
-; X86-AVX512-NEXT: kmovd %ecx, %k1
-; X86-AVX512-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
+; X86-AVX512-NEXT: vmovups (%eax), %ymm0
; X86-AVX512-NEXT: retl
%tmp = tail call <32 x i16> @llvm.masked.load.v32i16.p0(ptr %p, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <32 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
%r = bitcast <32 x i16> %tmp to <8 x i64>
@@ -110,8 +71,8 @@ define <8 x i64> @mload512_to_mload128(ptr %p) nounwind {
;
; AVX1OR2-LABEL: mload512_to_mload128:
; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,0,0,0,0,0,0]
-; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,4294967295,0,0]
+; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT: retq
;
@@ -120,28 +81,29 @@ define <8 x i64> @mload512_to_mload128(ptr %p) nounwind {
; AVX512F-NEXT: movw $3, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovaps %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: mload512_to_mload128:
; AVX512VLDQ: ## %bb.0:
-; AVX512VLDQ-NEXT: movw $3, %ax
+; AVX512VLDQ-NEXT: movb $3, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
-; AVX512VLDQ-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0 {%k1} {z}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mload512_to_mload128:
; AVX512VLBW: ## %bb.0:
-; AVX512VLBW-NEXT: movw $3, %ax
+; AVX512VLBW-NEXT: movb $3, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: vmovaps (%rdi), %xmm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
;
; X86-AVX512-LABEL: mload512_to_mload128:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movw $3, %cx
+; X86-AVX512-NEXT: movb $3, %cl
; X86-AVX512-NEXT: kmovd %ecx, %k1
-; X86-AVX512-NEXT: vmovaps (%eax), %zmm0 {%k1} {z}
+; X86-AVX512-NEXT: vmovaps (%eax), %xmm0 {%k1} {z}
; X86-AVX512-NEXT: retl
%tmp = tail call <16 x float> @llvm.masked.load.v16f32.p0(ptr %p, i32 64, <16 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>)
%r = bitcast <16 x float> %tmp to <8 x i64>
@@ -166,8 +128,9 @@ define <4 x i64> @mload256_to_mload128(ptr %p) nounwind {
;
; AVX1OR2-LABEL: mload256_to_mload128:
; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,0,4294967295,0,0,0,0,0]
-; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1OR2-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,0,4294967295,0]
+; AVX1OR2-NEXT: ## xmm0 = mem[0,0]
+; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: mload256_to_mload128:
@@ -175,21 +138,21 @@ define <4 x i64> @mload256_to_mload128(ptr %p) nounwind {
; AVX512F-NEXT: movw $5, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT: vmovaps %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: mload256_to_mload128:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $5, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
-; AVX512VLDQ-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
+; AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0 {%k1} {z}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mload256_to_mload128:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $5, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: vmovaps (%rdi), %xmm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
;
; X86-AVX512-LABEL: mload256_to_mload128:
@@ -197,7 +160,7 @@ define <4 x i64> @mload256_to_mload128(ptr %p) nounwind {
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movb $5, %cl
; X86-AVX512-NEXT: kmovd %ecx, %k1
-; X86-AVX512-NEXT: vmovaps (%eax), %ymm0 {%k1} {z}
+; X86-AVX512-NEXT: vmovaps (%eax), %xmm0 {%k1} {z}
; X86-AVX512-NEXT: retl
%tmp = tail call <8 x float> @llvm.masked.load.v8f32.p0(ptr %p, i32 32, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x float> <float poison, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>)
%r = bitcast <8 x float> %tmp to <4 x i64>
@@ -226,28 +189,29 @@ define <8 x i64> @mload512_to_mload256(ptr %p) nounwind {
; AVX512F-NEXT: movw $28, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovaps %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: mload512_to_mload256:
; AVX512VLDQ: ## %bb.0:
-; AVX512VLDQ-NEXT: movw $28, %ax
+; AVX512VLDQ-NEXT: movb $28, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
-; AVX512VLDQ-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512VLDQ-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mload512_to_mload256:
; AVX512VLBW: ## %bb.0:
-; AVX512VLBW-NEXT: movw $28, %ax
+; AVX512VLBW-NEXT: movb $28, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
;
; X86-AVX512-LABEL: mload512_to_mload256:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movw $28, %cx
+; X86-AVX512-NEXT: movb $28, %cl
; X86-AVX512-NEXT: kmovd %ecx, %k1
-; X86-AVX512-NEXT: vmovaps (%eax), %zmm0 {%k1} {z}
+; X86-AVX512-NEXT: vmovaps (%eax), %ymm0 {%k1} {z}
; X86-AVX512-NEXT: retl
%tmp = tail call <16 x float> @llvm.masked.load.v16f32.p0(ptr %p, i32 64, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> <float 0.000000e+00, float 0.000000e+00, float poison, float poison, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>)
%r = bitcast <16 x float> %tmp to <8 x i64>
@@ -266,7 +230,7 @@ define <8 x i64> @mload512_fail_no_possible_shrink(ptr %p) nounwind {
; AVX1OR2-LABEL: mload512_fail_no_possible_shrink:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0]
-; AVX1OR2-NEXT: vmaskmovps 32(%rdi), %ymm0, %ymm1
+; AVX1OR2-NEXT: vmaskmovps 32(%rdi), %xmm0, %xmm1
; AVX1OR2-NEXT: vmovaps (%rdi), %ymm0
; AVX1OR2-NEXT: retq
;
@@ -413,44 +377,16 @@ define void @mstore256_to_store128(ptr %p, <4 x i64> %v) nounwind {
; SSE42-NEXT: movups %xmm0, (%rdi)
; SSE42-NEXT: retq
;
-; AVX1OR2-LABEL: mstore256_to_store128:
-; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295,0,0,0,0]
-; AVX1OR2-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi)
-; AVX1OR2-NEXT: vzeroupper
-; AVX1OR2-NEXT: retq
-;
-; AVX512F-LABEL: mstore256_to_store128:
-; AVX512F: ## %bb.0:
-; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: movw $15, %ax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1}
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VLDQ-LABEL: mstore256_to_store128:
-; AVX512VLDQ: ## %bb.0:
-; AVX512VLDQ-NEXT: movb $15, %al
-; AVX512VLDQ-NEXT: kmovw %eax, %k1
-; AVX512VLDQ-NEXT: vmovaps %ymm0, (%rdi) {%k1}
-; AVX512VLDQ-NEXT: vzeroupper
-; AVX512VLDQ-NEXT: retq
-;
-; AVX512VLBW-LABEL: mstore256_to_store128:
-; AVX512VLBW: ## %bb.0:
-; AVX512VLBW-NEXT: movb $15, %al
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovaps %ymm0, (%rdi) {%k1}
-; AVX512VLBW-NEXT: vzeroupper
-; AVX512VLBW-NEXT: retq
+; AVX-LABEL: mstore256_to_store128:
+; AVX: ## %bb.0:
+; AVX-NEXT: vmovaps %xmm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
;
; X86-AVX512-LABEL: mstore256_to_store128:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movb $15, %cl
-; X86-AVX512-NEXT: kmovd %ecx, %k1
-; X86-AVX512-NEXT: vmovaps %ymm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vmovaps %xmm0, (%eax)
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
%tmp = bitcast <4 x i64> %v to <8 x float>
@@ -501,38 +437,16 @@ define void @mstore512_to_store256(ptr %p, <8 x i64> %v) nounwind {
; SSE42-NEXT: movups %xmm1, 16(%rdi)
; SSE42-NEXT: retq
;
-; AVX1OR2-LABEL: mstore512_to_store256:
-; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: vmovups %ymm0, (%rdi)
-; AVX1OR2-NEXT: vzeroupper
-; AVX1OR2-NEXT: retq
-;
-; AVX512F-LABEL: mstore512_to_store256:
-; AVX512F: ## %bb.0:
-; AVX512F-NEXT: vmovups %ymm0, (%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VLDQ-LABEL: mstore512_to_store256:
-; AVX512VLDQ: ## %bb.0:
-; AVX512VLDQ-NEXT: vmovups %ymm0, (%rdi)
-; AVX512VLDQ-NEXT: vzeroupper
-; AVX512VLDQ-NEXT: retq
-;
-; AVX512VLBW-LABEL: mstore512_to_store256:
-; AVX512VLBW: ## %bb.0:
-; AVX512VLBW-NEXT: movl $65535, %eax ## imm = 0xFFFF
-; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
-; AVX512VLBW-NEXT: vzeroupper
-; AVX512VLBW-NEXT: retq
+; AVX-LABEL: mstore512_to_store256:
+; AVX: ## %bb.0:
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
;
; X86-AVX512-LABEL: mstore512_to_store256:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movl $65535, %ecx ## imm = 0xFFFF
-; X86-AVX512-NEXT: kmovd %ecx, %k1
-; X86-AVX512-NEXT: vmovdqu16 %zmm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vmovups %ymm0, (%eax)
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
%tmp = bitcast <8 x i64> %v to <32 x i16>
@@ -556,8 +470,8 @@ define void @mstore512_to_mstore128(ptr %p, <8 x i64> %v) nounwind {
;
; AVX1OR2-LABEL: mstore512_to_mstore128:
; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,0,0,0,0,0,0]
-; AVX1OR2-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi)
+; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm1 = [4294967295,4294967295,0,0]
+; AVX1OR2-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
;
@@ -571,26 +485,26 @@ define void @mstore512_to_mstore128(ptr %p, <8 x i64> %v) nounwind {
;
; AVX512VLDQ-LABEL: mstore512_to_mstore128:
; AVX512VLDQ: ## %bb.0:
-; AVX512VLDQ-NEXT: movw $3, %ax
+; AVX512VLDQ-NEXT: movb $3, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
-; AVX512VLDQ-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT: vmovaps %xmm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mstore512_to_mstore128:
; AVX512VLBW: ## %bb.0:
-; AVX512VLBW-NEXT: movw $3, %ax
+; AVX512VLBW-NEXT: movb $3, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT: vmovaps %xmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; X86-AVX512-LABEL: mstore512_to_mstore128:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movw $3, %cx
+; X86-AVX512-NEXT: movb $3, %cl
; X86-AVX512-NEXT: kmovd %ecx, %k1
-; X86-AVX512-NEXT: vmovaps %zmm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) {%k1}
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
%tmp = bitcast <8 x i64> %v to <16 x float>
@@ -614,8 +528,9 @@ define void @mstore256_to_mstore128(ptr %p, <4 x i64> %v) nounwind {
;
; AVX1OR2-LABEL: mstore256_to_mstore128:
; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,0,4294967295,0,0,0,0,0]
-; AVX1OR2-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi)
+; AVX1OR2-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
+; AVX1OR2-NEXT: ## xmm1 = mem[0,0]
+; AVX1OR2-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
;
@@ -632,7 +547,7 @@ define void @mstore256_to_mstore128(ptr %p, <4 x i64> %v) nounwind {
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $5, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
-; AVX512VLDQ-NEXT: vmovaps %ymm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT: vmovaps %xmm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
@@ -640,7 +555,7 @@ define void @mstore256_to_mstore128(ptr %p, <4 x i64> %v) nounwind {
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $5, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovaps %ymm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT: vmovaps %xmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
@@ -649,7 +564,7 @@ define void @mstore256_to_mstore128(ptr %p, <4 x i64> %v) nounwind {
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movb $5, %cl
; X86-AVX512-NEXT: kmovd %ecx, %k1
-; X86-AVX512-NEXT: vmovaps %ymm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) {%k1}
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
%tmp = bitcast <4 x i64> %v to <8 x float>
@@ -691,26 +606,26 @@ define void @mstore512_to_mstore256(ptr %p, <8 x i64> %v) nounwind {
;
; AVX512VLDQ-LABEL: mstore512_to_mstore256:
; AVX512VLDQ: ## %bb.0:
-; AVX512VLDQ-NEXT: movw $28, %ax
+; AVX512VLDQ-NEXT: movb $28, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
-; AVX512VLDQ-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT: vmovaps %ymm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mstore512_to_mstore256:
; AVX512VLBW: ## %bb.0:
-; AVX512VLBW-NEXT: movw $28, %ax
+; AVX512VLBW-NEXT: movb $28, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT: vmovaps %ymm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; X86-AVX512-LABEL: mstore512_to_mstore256:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movw $28, %cx
+; X86-AVX512-NEXT: movb $28, %cl
; X86-AVX512-NEXT: kmovd %ecx, %k1
-; X86-AVX512-NEXT: vmovaps %zmm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vmovaps %ymm0, (%eax) {%k1}
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
%tmp = bitcast <8 x i64> %v to <16 x float>
@@ -796,8 +711,6 @@ declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32 immarg, <32 x i1>
declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32 immarg, <16 x i1>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
; AVX1: {{.*}}
; AVX2: {{.*}}
-; AVX512: {{.*}}
; AVX512VL: {{.*}}
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 89459a2d10177d..29793486b100e1 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6576,8 +6576,8 @@ define <8 x float> @mload_constmask_v8f32_zero(ptr %addr, <8 x float> %dst) {
;
; AVX1OR2-LABEL: mload_constmask_v8f32_zero:
; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,4294967295,0,0,0,0,0]
-; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1OR2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,4294967295,0]
+; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v8f32_zero:
@@ -6585,21 +6585,21 @@ define <8 x float> @mload_constmask_v8f32_zero(ptr %addr, <8 x float> %dst) {
; AVX512F-NEXT: movw $7, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT: vmovaps %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: mload_constmask_v8f32_zero:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $7, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
-; AVX512VLDQ-NEXT: vmovups (%rdi), %ymm0 {%k1} {z}
+; AVX512VLDQ-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: mload_constmask_v8f32_zero:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $7, %al
; AVX512VLBW-NEXT: kmovd %eax, %k1
-; AVX512VLBW-NEXT: vmovups (%rdi), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
;
; X86-AVX512-LABEL: mload_constmask_v8f32_zero:
@@ -6607,7 +6607,7 @@ define <8 x float> @mload_constmask_v8f32_zero(ptr %addr, <8 x float> %dst) {
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movb $7, %cl
; X86-AVX512-NEXT: kmovd %ecx, %k1
-; X86-AVX512-NEXT: vmovups (%eax), %ymm0 {%k1} {z}
+; X86-AVX512-NEXT: vmovups (%eax), %xmm0 {%k1} {z}
; X86-AVX512-NEXT: retl
%res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> zeroinitializer)
ret <8 x float> %res
diff --git a/llvm/test/CodeGen/X86/masked_loadstore_split.ll b/llvm/test/CodeGen/X86/masked_loadstore_split.ll
index 0e689a597a72ba..625d9ce3b64b04 100644
--- a/llvm/test/CodeGen/X86/masked_loadstore_split.ll
+++ b/llvm/test/CodeGen/X86/masked_loadstore_split.ll
@@ -7,10 +7,10 @@ define void @split_masked_store(ptr %0) {
; CHECK-NEXT: liveins: $rdi
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rdi
+ ; CHECK-NEXT: [[V_SET0_:%[0-9]+]]:vr128 = V_SET0
+ ; CHECK-NEXT: VMOVUPDmr [[COPY]], 1, $noreg, 32, $noreg, killed [[V_SET0_]] :: (store (s128) into %ir.0 + 32, align 8)
; CHECK-NEXT: [[AVX_SET0_:%[0-9]+]]:vr256 = AVX_SET0
- ; CHECK-NEXT: [[VMOVAPSYrm:%[0-9]+]]:vr256 = VMOVAPSYrm $rip, 1, $noreg, %const.0, $noreg :: (load (s256) from constant-pool)
- ; CHECK-NEXT: VMASKMOVPDYmr [[COPY]], 1, $noreg, 32, $noreg, killed [[VMOVAPSYrm]], [[AVX_SET0_]] :: (store unknown-size into %ir.0 + 32, align 8)
- ; CHECK-NEXT: VMOVUPDYmr [[COPY]], 1, $noreg, 0, $noreg, [[AVX_SET0_]] :: (store (s256) into %ir.0, align 8)
+ ; CHECK-NEXT: VMOVUPDYmr [[COPY]], 1, $noreg, 0, $noreg, killed [[AVX_SET0_]] :: (store (s256) into %ir.0, align 8)
; CHECK-NEXT: RET 0
entry:
call void @llvm.masked.store.v8f64.p0(<8 x double> zeroinitializer, ptr %0, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>)
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index c7ec5e87dcc6bd..dd7694365ec114 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -5010,10 +5010,10 @@ define void @top_bits_unset_stack() nounwind {
;
; AVX1OR2-LABEL: top_bits_unset_stack:
; AVX1OR2: ## %bb.0: ## %entry
-; AVX1OR2-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; AVX1OR2-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,0,0]
-; AVX1OR2-NEXT: vmaskmovpd %ymm0, %ymm1, -{{[0-9]+}}(%rsp)
-; AVX1OR2-NEXT: vmovupd %ymm0, -{{[0-9]+}}(%rsp)
+; AVX1OR2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1OR2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1OR2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/pr46532.ll b/llvm/test/CodeGen/X86/pr46532.ll
index cbc677229ede61..265873f0ed6271 100644
--- a/llvm/test/CodeGen/X86/pr46532.ll
+++ b/llvm/test/CodeGen/X86/pr46532.ll
@@ -8,7 +8,7 @@ define void @WhileWithLoopInvariantOperation.21() {
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %xmm0, 32(%rax)
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,4294967295,0,0]
-; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
+; CHECK-NEXT: vmaskmovps %xmm0, %xmm0, (%rax)
while.1.body.preheader:
%0 = load ptr, ptr undef, align 8, !invariant.load !0, !dereferenceable !1, !align !2
%1 = getelementptr inbounds i8, ptr %0, i64 32