[llvm] [X86] Allow EVEX compression for mask registers (PR #171980)
Qihan Cai via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 23 07:18:36 PST 2025
https://github.com/realqhc updated https://github.com/llvm/llvm-project/pull/171980
>From e8b53eb19caffa0d414682bda9f5ba6a7bbdb296 Mon Sep 17 00:00:00 2001
From: Qihan Cai <caiqihan021 at hotmail.com>
Date: Fri, 12 Dec 2025 20:43:13 +1100
Subject: [PATCH 01/10] [X86] combineBitcastvxi1 - prefer movmsk for i32
truncate
Truncates from i32 vectors should directly use vmovmskps instead of converting to mask registers.
Fixes #171746
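For reference, a minimal IR reproducer for the pattern this targets (a sketch
based on the trunc_v4i32_v4i1 test updated below; the function name is
illustrative and not taken from the patch):

  ; Hypothetical reproducer; compile with llc -mtriple=x86_64 -mattr=+avx512f
  define i4 @trunc_v4i32_to_i4(<4 x i32> %v) {
    %t = trunc <4 x i32> %v to <4 x i1>
    %m = bitcast <4 x i1> %t to i4
    ret i4 %m
  }

Previously the bitcast was lowered through vptestmd + kmovw on AVX-512 targets;
with this change it goes through vpslld $31 + vmovmskps, as the updated CHECK
lines show.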
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 11 +-
.../test/CodeGen/X86/masked_gather_scatter.ll | 192 ++++++++----------
.../CodeGen/X86/vector-reduce-xor-bool.ll | 72 ++-----
3 files changed, 107 insertions(+), 168 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e0ad9caa51705..8b3dc97106f11 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45612,14 +45612,15 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
}
}
- // If the input is a truncate from v16i8 or v32i8 go ahead and use a
- // movmskb even with avx512. This will be better than truncating to vXi1 and
- // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
- // vpcmpeqb/vpcmpgtb.
+ // If the input is a truncate from a small vector type (v16i8, v32i8, v64i8,
+ // v4i32, v8i32), prefer using movmsk instructions (vmovmskb, vmovmskps)
+ // even with avx512 instead of converting to vXi1 and using kmov.
bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
(Src.getOperand(0).getValueType() == MVT::v16i8 ||
Src.getOperand(0).getValueType() == MVT::v32i8 ||
- Src.getOperand(0).getValueType() == MVT::v64i8);
+ Src.getOperand(0).getValueType() == MVT::v64i8 ||
+ Src.getOperand(0).getValueType() == MVT::v4i32 ||
+ Src.getOperand(0).getValueType() == MVT::v8i32);
// Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
// directly with vpmovmskb/vmovmskps/vmovmskpd.
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 58adbb767ed87..e141a7a5421bc 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -648,29 +648,28 @@ declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2
define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
; X64-KNL-LABEL: test15:
; X64-KNL: # %bb.0:
-; X64-KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; X64-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; X64-KNL-NEXT: vpmovsxdq %xmm0, %ymm0
; X64-KNL-NEXT: vpsllq $2, %ymm0, %ymm0
-; X64-KNL-NEXT: vmovq %rdi, %xmm1
-; X64-KNL-NEXT: vpbroadcastq %xmm1, %ymm1
-; X64-KNL-NEXT: vpaddq %ymm0, %ymm1, %ymm1
-; X64-KNL-NEXT: kmovw %k0, %eax
+; X64-KNL-NEXT: vmovq %rdi, %xmm2
+; X64-KNL-NEXT: vpbroadcastq %xmm2, %ymm2
+; X64-KNL-NEXT: vpaddq %ymm0, %ymm2, %ymm2
+; X64-KNL-NEXT: vpslld $31, %xmm1, %xmm0
+; X64-KNL-NEXT: vmovmskps %xmm0, %eax
; X64-KNL-NEXT: testb $1, %al
; X64-KNL-NEXT: # implicit-def: $xmm0
; X64-KNL-NEXT: je .LBB14_2
; X64-KNL-NEXT: # %bb.1: # %cond.load
-; X64-KNL-NEXT: vmovq %xmm1, %rcx
+; X64-KNL-NEXT: vmovq %xmm2, %rcx
; X64-KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-KNL-NEXT: .LBB14_2: # %else
; X64-KNL-NEXT: testb $2, %al
; X64-KNL-NEXT: je .LBB14_4
; X64-KNL-NEXT: # %bb.3: # %cond.load1
-; X64-KNL-NEXT: vpextrq $1, %xmm1, %rcx
+; X64-KNL-NEXT: vpextrq $1, %xmm2, %rcx
; X64-KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-KNL-NEXT: .LBB14_4: # %else2
; X64-KNL-NEXT: testb $4, %al
-; X64-KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; X64-KNL-NEXT: vextracti128 $1, %ymm2, %xmm1
; X64-KNL-NEXT: jne .LBB14_5
; X64-KNL-NEXT: # %bb.6: # %else5
; X64-KNL-NEXT: testb $8, %al
@@ -691,12 +690,11 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
;
; X86-KNL-LABEL: test15:
; X86-KNL: # %bb.0:
-; X86-KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; X86-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; X86-KNL-NEXT: vpslld $2, %xmm0, %xmm0
-; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
-; X86-KNL-NEXT: vpaddd %xmm0, %xmm1, %xmm1
-; X86-KNL-NEXT: kmovw %k0, %eax
+; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
+; X86-KNL-NEXT: vpaddd %xmm0, %xmm2, %xmm2
+; X86-KNL-NEXT: vpslld $31, %xmm1, %xmm0
+; X86-KNL-NEXT: vmovmskps %xmm0, %eax
; X86-KNL-NEXT: testb $1, %al
; X86-KNL-NEXT: # implicit-def: $xmm0
; X86-KNL-NEXT: jne .LBB14_1
@@ -710,27 +708,25 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: jne .LBB14_7
; X86-KNL-NEXT: .LBB14_8: # %else8
-; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
; X86-KNL-NEXT: .LBB14_1: # %cond.load
-; X86-KNL-NEXT: vmovd %xmm1, %ecx
+; X86-KNL-NEXT: vmovd %xmm2, %ecx
; X86-KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-KNL-NEXT: testb $2, %al
; X86-KNL-NEXT: je .LBB14_4
; X86-KNL-NEXT: .LBB14_3: # %cond.load1
-; X86-KNL-NEXT: vpextrd $1, %xmm1, %ecx
+; X86-KNL-NEXT: vpextrd $1, %xmm2, %ecx
; X86-KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-KNL-NEXT: testb $4, %al
; X86-KNL-NEXT: je .LBB14_6
; X86-KNL-NEXT: .LBB14_5: # %cond.load4
-; X86-KNL-NEXT: vpextrd $2, %xmm1, %ecx
+; X86-KNL-NEXT: vpextrd $2, %xmm2, %ecx
; X86-KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: je .LBB14_8
; X86-KNL-NEXT: .LBB14_7: # %cond.load7
-; X86-KNL-NEXT: vpextrd $3, %xmm1, %eax
+; X86-KNL-NEXT: vpextrd $3, %xmm2, %eax
; X86-KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
;
; X64-SKX-LABEL: test15:
@@ -761,27 +757,26 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
define <4 x double> @test16(ptr %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
; X64-KNL-LABEL: test16:
; X64-KNL: # %bb.0:
-; X64-KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; X64-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; X64-KNL-NEXT: vpmovsxdq %xmm0, %ymm0
; X64-KNL-NEXT: vpsllq $3, %ymm0, %ymm0
-; X64-KNL-NEXT: vmovq %rdi, %xmm1
-; X64-KNL-NEXT: vpbroadcastq %xmm1, %ymm1
-; X64-KNL-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; X64-KNL-NEXT: kmovw %k0, %eax
+; X64-KNL-NEXT: vmovq %rdi, %xmm3
+; X64-KNL-NEXT: vpbroadcastq %xmm3, %ymm3
+; X64-KNL-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; X64-KNL-NEXT: vpslld $31, %xmm1, %xmm1
+; X64-KNL-NEXT: vmovmskps %xmm1, %eax
; X64-KNL-NEXT: testb $1, %al
; X64-KNL-NEXT: je .LBB15_2
; X64-KNL-NEXT: # %bb.1: # %cond.load
; X64-KNL-NEXT: vmovq %xmm0, %rcx
-; X64-KNL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7]
+; X64-KNL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; X64-KNL-NEXT: .LBB15_2: # %else
; X64-KNL-NEXT: testb $2, %al
; X64-KNL-NEXT: je .LBB15_4
; X64-KNL-NEXT: # %bb.3: # %cond.load1
; X64-KNL-NEXT: vpextrq $1, %xmm0, %rcx
; X64-KNL-NEXT: vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1]
-; X64-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; X64-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; X64-KNL-NEXT: .LBB15_4: # %else2
; X64-KNL-NEXT: testb $4, %al
; X64-KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -790,29 +785,28 @@ define <4 x double> @test16(ptr %base, <4 x i32> %ind, <4 x i1> %mask, <4 x doub
; X64-KNL-NEXT: testb $8, %al
; X64-KNL-NEXT: jne .LBB15_7
; X64-KNL-NEXT: .LBB15_8: # %else8
-; X64-KNL-NEXT: vmovdqa %ymm2, %ymm0
+; X64-KNL-NEXT: vmovaps %ymm2, %ymm0
; X64-KNL-NEXT: retq
; X64-KNL-NEXT: .LBB15_5: # %cond.load4
; X64-KNL-NEXT: vmovq %xmm0, %rcx
-; X64-KNL-NEXT: vpbroadcastq (%rcx), %ymm1
-; X64-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
+; X64-KNL-NEXT: vbroadcastsd (%rcx), %ymm1
+; X64-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; X64-KNL-NEXT: testb $8, %al
; X64-KNL-NEXT: je .LBB15_8
; X64-KNL-NEXT: .LBB15_7: # %cond.load7
; X64-KNL-NEXT: vpextrq $1, %xmm0, %rax
-; X64-KNL-NEXT: vpbroadcastq (%rax), %ymm0
-; X64-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; X64-KNL-NEXT: vmovdqa %ymm2, %ymm0
+; X64-KNL-NEXT: vbroadcastsd (%rax), %ymm0
+; X64-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; X64-KNL-NEXT: vmovaps %ymm2, %ymm0
; X64-KNL-NEXT: retq
;
; X86-KNL-LABEL: test16:
; X86-KNL: # %bb.0:
-; X86-KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; X86-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; X86-KNL-NEXT: vpslld $3, %xmm0, %xmm0
-; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
-; X86-KNL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; X86-KNL-NEXT: kmovw %k0, %eax
+; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm3
+; X86-KNL-NEXT: vpaddd %xmm0, %xmm3, %xmm0
+; X86-KNL-NEXT: vpslld $31, %xmm1, %xmm1
+; X86-KNL-NEXT: vmovmskps %xmm1, %eax
; X86-KNL-NEXT: testb $1, %al
; X86-KNL-NEXT: jne .LBB15_1
; X86-KNL-NEXT: # %bb.2: # %else
@@ -825,31 +819,31 @@ define <4 x double> @test16(ptr %base, <4 x i32> %ind, <4 x i1> %mask, <4 x doub
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: jne .LBB15_7
; X86-KNL-NEXT: .LBB15_8: # %else8
-; X86-KNL-NEXT: vmovdqa %ymm2, %ymm0
+; X86-KNL-NEXT: vmovaps %ymm2, %ymm0
; X86-KNL-NEXT: retl
; X86-KNL-NEXT: .LBB15_1: # %cond.load
; X86-KNL-NEXT: vmovd %xmm0, %ecx
-; X86-KNL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; X86-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7]
+; X86-KNL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; X86-KNL-NEXT: testb $2, %al
; X86-KNL-NEXT: je .LBB15_4
; X86-KNL-NEXT: .LBB15_3: # %cond.load1
; X86-KNL-NEXT: vpextrd $1, %xmm0, %ecx
; X86-KNL-NEXT: vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1]
-; X86-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; X86-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; X86-KNL-NEXT: testb $4, %al
; X86-KNL-NEXT: je .LBB15_6
; X86-KNL-NEXT: .LBB15_5: # %cond.load4
; X86-KNL-NEXT: vpextrd $2, %xmm0, %ecx
-; X86-KNL-NEXT: vpbroadcastq (%ecx), %ymm1
-; X86-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
+; X86-KNL-NEXT: vbroadcastsd (%ecx), %ymm1
+; X86-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: je .LBB15_8
; X86-KNL-NEXT: .LBB15_7: # %cond.load7
; X86-KNL-NEXT: vpextrd $3, %xmm0, %eax
-; X86-KNL-NEXT: vpbroadcastq (%eax), %ymm0
-; X86-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; X86-KNL-NEXT: vmovdqa %ymm2, %ymm0
+; X86-KNL-NEXT: vbroadcastsd (%eax), %ymm0
+; X86-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; X86-KNL-NEXT: vmovaps %ymm2, %ymm0
; X86-KNL-NEXT: retl
;
; X64-SKX-LABEL: test16:
@@ -1004,8 +998,7 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
; X64-KNL-LABEL: test18:
; X64-KNL: # %bb.0:
; X64-KNL-NEXT: vpslld $31, %xmm2, %xmm2
-; X64-KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
-; X64-KNL-NEXT: kmovw %k0, %eax
+; X64-KNL-NEXT: vmovmskps %xmm2, %eax
; X64-KNL-NEXT: testb $1, %al
; X64-KNL-NEXT: je .LBB17_2
; X64-KNL-NEXT: # %bb.1: # %cond.store
@@ -1041,8 +1034,7 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
; X86-KNL-LABEL: test18:
; X86-KNL: # %bb.0:
; X86-KNL-NEXT: vpslld $31, %xmm2, %xmm2
-; X86-KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
-; X86-KNL-NEXT: kmovw %k0, %eax
+; X86-KNL-NEXT: vmovmskps %xmm2, %eax
; X86-KNL-NEXT: testb $1, %al
; X86-KNL-NEXT: jne .LBB17_1
; X86-KNL-NEXT: # %bb.2: # %else
@@ -1055,7 +1047,6 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: jne .LBB17_7
; X86-KNL-NEXT: .LBB17_8: # %else6
-; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
; X86-KNL-NEXT: .LBB17_1: # %cond.store
; X86-KNL-NEXT: vmovd %xmm1, %ecx
@@ -1075,7 +1066,6 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
; X86-KNL-NEXT: .LBB17_7: # %cond.store5
; X86-KNL-NEXT: vpextrd $3, %xmm1, %eax
; X86-KNL-NEXT: vextractps $3, %xmm0, (%eax)
-; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
;
; X64-SKX-LABEL: test18:
@@ -1099,28 +1089,27 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; X64-KNL-LABEL: test19:
; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vpsllq $3, %ymm2, %ymm2
+; X64-KNL-NEXT: vmovq %rdi, %xmm3
+; X64-KNL-NEXT: vpbroadcastq %xmm3, %ymm3
+; X64-KNL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X64-KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; X64-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; X64-KNL-NEXT: vpsllq $3, %ymm2, %ymm1
-; X64-KNL-NEXT: vmovq %rdi, %xmm2
-; X64-KNL-NEXT: vpbroadcastq %xmm2, %ymm2
-; X64-KNL-NEXT: vpaddq %ymm1, %ymm2, %ymm1
-; X64-KNL-NEXT: kmovw %k0, %eax
+; X64-KNL-NEXT: vmovmskps %xmm1, %eax
; X64-KNL-NEXT: testb $1, %al
; X64-KNL-NEXT: je .LBB18_2
; X64-KNL-NEXT: # %bb.1: # %cond.store
-; X64-KNL-NEXT: vmovq %xmm1, %rcx
+; X64-KNL-NEXT: vmovq %xmm2, %rcx
; X64-KNL-NEXT: vmovlps %xmm0, (%rcx)
; X64-KNL-NEXT: .LBB18_2: # %else
; X64-KNL-NEXT: testb $2, %al
; X64-KNL-NEXT: je .LBB18_4
; X64-KNL-NEXT: # %bb.3: # %cond.store1
-; X64-KNL-NEXT: vpextrq $1, %xmm1, %rcx
+; X64-KNL-NEXT: vpextrq $1, %xmm2, %rcx
; X64-KNL-NEXT: vmovhps %xmm0, (%rcx)
; X64-KNL-NEXT: .LBB18_4: # %else2
; X64-KNL-NEXT: testb $4, %al
; X64-KNL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X64-KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; X64-KNL-NEXT: vextracti128 $1, %ymm2, %xmm1
; X64-KNL-NEXT: jne .LBB18_5
; X64-KNL-NEXT: # %bb.6: # %else4
; X64-KNL-NEXT: testb $8, %al
@@ -1142,23 +1131,22 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; X86-KNL-LABEL: test19:
; X86-KNL: # %bb.0:
; X86-KNL-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
+; X86-KNL-NEXT: vpmovqd %zmm2, %ymm2
+; X86-KNL-NEXT: vpslld $3, %xmm2, %xmm2
+; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm3
+; X86-KNL-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; X86-KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; X86-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; X86-KNL-NEXT: vpmovqd %zmm2, %ymm1
-; X86-KNL-NEXT: vpslld $3, %xmm1, %xmm1
-; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
-; X86-KNL-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; X86-KNL-NEXT: kmovw %k0, %eax
+; X86-KNL-NEXT: vmovmskps %xmm1, %eax
; X86-KNL-NEXT: testb $1, %al
; X86-KNL-NEXT: je .LBB18_2
; X86-KNL-NEXT: # %bb.1: # %cond.store
-; X86-KNL-NEXT: vmovd %xmm1, %ecx
+; X86-KNL-NEXT: vmovd %xmm2, %ecx
; X86-KNL-NEXT: vmovlps %xmm0, (%ecx)
; X86-KNL-NEXT: .LBB18_2: # %else
; X86-KNL-NEXT: testb $2, %al
; X86-KNL-NEXT: je .LBB18_4
; X86-KNL-NEXT: # %bb.3: # %cond.store1
-; X86-KNL-NEXT: vpextrd $1, %xmm1, %ecx
+; X86-KNL-NEXT: vpextrd $1, %xmm2, %ecx
; X86-KNL-NEXT: vmovhps %xmm0, (%ecx)
; X86-KNL-NEXT: .LBB18_4: # %else2
; X86-KNL-NEXT: testb $4, %al
@@ -1171,12 +1159,12 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
; X86-KNL-NEXT: .LBB18_5: # %cond.store3
-; X86-KNL-NEXT: vpextrd $2, %xmm1, %ecx
+; X86-KNL-NEXT: vpextrd $2, %xmm2, %ecx
; X86-KNL-NEXT: vmovlps %xmm0, (%ecx)
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: je .LBB18_8
; X86-KNL-NEXT: .LBB18_7: # %cond.store5
-; X86-KNL-NEXT: vpextrd $3, %xmm1, %eax
+; X86-KNL-NEXT: vpextrd $3, %xmm2, %eax
; X86-KNL-NEXT: vmovhps %xmm0, (%eax)
; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
@@ -4426,25 +4414,24 @@ define void @scatter_16i64_constant_indices(ptr %ptr, <16 x i1> %mask, <16 x i32
define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru) {
; X64-KNL-LABEL: splat_ptr_gather:
; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovq %rdi, %xmm2
+; X64-KNL-NEXT: vpbroadcastq %xmm2, %ymm2
; X64-KNL-NEXT: vpslld $31, %xmm0, %xmm0
-; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; X64-KNL-NEXT: vmovq %rdi, %xmm0
-; X64-KNL-NEXT: vpbroadcastq %xmm0, %ymm0
-; X64-KNL-NEXT: kmovw %k0, %eax
+; X64-KNL-NEXT: vmovmskps %xmm0, %eax
; X64-KNL-NEXT: testb $1, %al
; X64-KNL-NEXT: je .LBB62_2
; X64-KNL-NEXT: # %bb.1: # %cond.load
-; X64-KNL-NEXT: vmovq %xmm0, %rcx
+; X64-KNL-NEXT: vmovq %xmm2, %rcx
; X64-KNL-NEXT: vpinsrd $0, (%rcx), %xmm1, %xmm1
; X64-KNL-NEXT: .LBB62_2: # %else
; X64-KNL-NEXT: testb $2, %al
; X64-KNL-NEXT: je .LBB62_4
; X64-KNL-NEXT: # %bb.3: # %cond.load1
-; X64-KNL-NEXT: vpextrq $1, %xmm0, %rcx
+; X64-KNL-NEXT: vpextrq $1, %xmm2, %rcx
; X64-KNL-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm1
; X64-KNL-NEXT: .LBB62_4: # %else2
; X64-KNL-NEXT: testb $4, %al
-; X64-KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; X64-KNL-NEXT: vextracti128 $1, %ymm2, %xmm0
; X64-KNL-NEXT: jne .LBB62_5
; X64-KNL-NEXT: # %bb.6: # %else5
; X64-KNL-NEXT: testb $8, %al
@@ -4467,10 +4454,9 @@ define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru
;
; X86-KNL-LABEL: splat_ptr_gather:
; X86-KNL: # %bb.0:
+; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
; X86-KNL-NEXT: vpslld $31, %xmm0, %xmm0
-; X86-KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
-; X86-KNL-NEXT: kmovw %k0, %eax
+; X86-KNL-NEXT: vmovmskps %xmm0, %eax
; X86-KNL-NEXT: testb $1, %al
; X86-KNL-NEXT: jne .LBB62_1
; X86-KNL-NEXT: # %bb.2: # %else
@@ -4484,28 +4470,26 @@ define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru
; X86-KNL-NEXT: jne .LBB62_7
; X86-KNL-NEXT: .LBB62_8: # %else8
; X86-KNL-NEXT: vmovdqa %xmm1, %xmm0
-; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
; X86-KNL-NEXT: .LBB62_1: # %cond.load
-; X86-KNL-NEXT: vmovd %xmm0, %ecx
+; X86-KNL-NEXT: vmovd %xmm2, %ecx
; X86-KNL-NEXT: vpinsrd $0, (%ecx), %xmm1, %xmm1
; X86-KNL-NEXT: testb $2, %al
; X86-KNL-NEXT: je .LBB62_4
; X86-KNL-NEXT: .LBB62_3: # %cond.load1
-; X86-KNL-NEXT: vpextrd $1, %xmm0, %ecx
+; X86-KNL-NEXT: vpextrd $1, %xmm2, %ecx
; X86-KNL-NEXT: vpinsrd $1, (%ecx), %xmm1, %xmm1
; X86-KNL-NEXT: testb $4, %al
; X86-KNL-NEXT: je .LBB62_6
; X86-KNL-NEXT: .LBB62_5: # %cond.load4
-; X86-KNL-NEXT: vpextrd $2, %xmm0, %ecx
+; X86-KNL-NEXT: vpextrd $2, %xmm2, %ecx
; X86-KNL-NEXT: vpinsrd $2, (%ecx), %xmm1, %xmm1
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: je .LBB62_8
; X86-KNL-NEXT: .LBB62_7: # %cond.load7
-; X86-KNL-NEXT: vpextrd $3, %xmm0, %eax
+; X86-KNL-NEXT: vpextrd $3, %xmm2, %eax
; X86-KNL-NEXT: vpinsrd $3, (%eax), %xmm1, %xmm1
; X86-KNL-NEXT: vmovdqa %xmm1, %xmm0
-; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
;
; X64-SKX-LABEL: splat_ptr_gather:
@@ -4536,25 +4520,24 @@ declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x
define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
; X64-KNL-LABEL: splat_ptr_scatter:
; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovq %rdi, %xmm2
+; X64-KNL-NEXT: vpbroadcastq %xmm2, %ymm2
; X64-KNL-NEXT: vpslld $31, %xmm0, %xmm0
-; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; X64-KNL-NEXT: vmovq %rdi, %xmm0
-; X64-KNL-NEXT: vpbroadcastq %xmm0, %ymm0
-; X64-KNL-NEXT: kmovw %k0, %eax
+; X64-KNL-NEXT: vmovmskps %xmm0, %eax
; X64-KNL-NEXT: testb $1, %al
; X64-KNL-NEXT: je .LBB63_2
; X64-KNL-NEXT: # %bb.1: # %cond.store
-; X64-KNL-NEXT: vmovq %xmm0, %rcx
+; X64-KNL-NEXT: vmovq %xmm2, %rcx
; X64-KNL-NEXT: vmovss %xmm1, (%rcx)
; X64-KNL-NEXT: .LBB63_2: # %else
; X64-KNL-NEXT: testb $2, %al
; X64-KNL-NEXT: je .LBB63_4
; X64-KNL-NEXT: # %bb.3: # %cond.store1
-; X64-KNL-NEXT: vpextrq $1, %xmm0, %rcx
+; X64-KNL-NEXT: vpextrq $1, %xmm2, %rcx
; X64-KNL-NEXT: vextractps $1, %xmm1, (%rcx)
; X64-KNL-NEXT: .LBB63_4: # %else2
; X64-KNL-NEXT: testb $4, %al
-; X64-KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; X64-KNL-NEXT: vextracti128 $1, %ymm2, %xmm0
; X64-KNL-NEXT: jne .LBB63_5
; X64-KNL-NEXT: # %bb.6: # %else4
; X64-KNL-NEXT: testb $8, %al
@@ -4575,10 +4558,9 @@ define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
;
; X86-KNL-LABEL: splat_ptr_scatter:
; X86-KNL: # %bb.0:
+; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
; X86-KNL-NEXT: vpslld $31, %xmm0, %xmm0
-; X86-KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
-; X86-KNL-NEXT: kmovw %k0, %eax
+; X86-KNL-NEXT: vmovmskps %xmm0, %eax
; X86-KNL-NEXT: testb $1, %al
; X86-KNL-NEXT: jne .LBB63_1
; X86-KNL-NEXT: # %bb.2: # %else
@@ -4591,27 +4573,25 @@ define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: jne .LBB63_7
; X86-KNL-NEXT: .LBB63_8: # %else6
-; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
; X86-KNL-NEXT: .LBB63_1: # %cond.store
-; X86-KNL-NEXT: vmovd %xmm0, %ecx
+; X86-KNL-NEXT: vmovd %xmm2, %ecx
; X86-KNL-NEXT: vmovss %xmm1, (%ecx)
; X86-KNL-NEXT: testb $2, %al
; X86-KNL-NEXT: je .LBB63_4
; X86-KNL-NEXT: .LBB63_3: # %cond.store1
-; X86-KNL-NEXT: vpextrd $1, %xmm0, %ecx
+; X86-KNL-NEXT: vpextrd $1, %xmm2, %ecx
; X86-KNL-NEXT: vextractps $1, %xmm1, (%ecx)
; X86-KNL-NEXT: testb $4, %al
; X86-KNL-NEXT: je .LBB63_6
; X86-KNL-NEXT: .LBB63_5: # %cond.store3
-; X86-KNL-NEXT: vpextrd $2, %xmm0, %ecx
+; X86-KNL-NEXT: vpextrd $2, %xmm2, %ecx
; X86-KNL-NEXT: vextractps $2, %xmm1, (%ecx)
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: je .LBB63_8
; X86-KNL-NEXT: .LBB63_7: # %cond.store5
-; X86-KNL-NEXT: vpextrd $3, %xmm0, %eax
+; X86-KNL-NEXT: vpextrd $3, %xmm2, %eax
; X86-KNL-NEXT: vextractps $3, %xmm1, (%eax)
-; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
;
; X64-SKX-LABEL: splat_ptr_scatter:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 6cb43234d713b..0dce8e204f169 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -79,34 +79,13 @@ define i1 @trunc_v4i32_v4i1(<4 x i32>) nounwind {
; AVX-NEXT: setnp %al
; AVX-NEXT: retq
;
-; AVX512F-LABEL: trunc_v4i32_v4i1:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb $15, %al
-; AVX512F-NEXT: setnp %al
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v4i32_v4i1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: testb $15, %al
-; AVX512BW-NEXT: setnp %al
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v4i32_v4i1:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k0
-; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: testb %al, %al
-; AVX512VL-NEXT: setnp %al
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: trunc_v4i32_v4i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT: vmovmskps %xmm0, %eax
+; AVX512-NEXT: testb %al, %al
+; AVX512-NEXT: setnp %al
+; AVX512-NEXT: retq
%a = trunc <4 x i32> %0 to <4 x i1>
%b = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a)
ret i1 %b
@@ -307,35 +286,14 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: trunc_v8i32_v8i1:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: testb %al, %al
-; AVX512F-NEXT: setnp %al
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v8i32_v8i1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: testb %al, %al
-; AVX512BW-NEXT: setnp %al
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v8i32_v8i1:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
-; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: testb %al, %al
-; AVX512VL-NEXT: setnp %al
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: trunc_v8i32_v8i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512-NEXT: vmovmskps %ymm0, %eax
+; AVX512-NEXT: testb %al, %al
+; AVX512-NEXT: setnp %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%a = trunc <8 x i32> %0 to <8 x i1>
%b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a)
ret i1 %b
>From 851e376c4290c3db4fdce60a8e344847ed450ff9 Mon Sep 17 00:00:00 2001
From: Qihan Cai <caiqihan021 at hotmail.com>
Date: Fri, 19 Dec 2025 16:09:22 +1100
Subject: [PATCH 02/10] Revert "[X86] combineBitcastvxi1 - prefer movmsk for
i32 truncate"
This reverts commit e8b53eb19caffa0d414682bda9f5ba6a7bbdb296.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 11 +-
.../test/CodeGen/X86/masked_gather_scatter.ll | 192 ++++++++++--------
.../CodeGen/X86/vector-reduce-xor-bool.ll | 72 +++++--
3 files changed, 168 insertions(+), 107 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8b3dc97106f11..e0ad9caa51705 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45612,15 +45612,14 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
}
}
- // If the input is a truncate from a small vector type (v16i8, v32i8, v64i8,
- // v4i32, v8i32), prefer using movmsk instructions (vmovmskb, vmovmskps)
- // even with avx512 instead of converting to vXi1 and using kmov.
+ // If the input is a truncate from v16i8 or v32i8 go ahead and use a
+ // movmskb even with avx512. This will be better than truncating to vXi1 and
+ // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
+ // vpcmpeqb/vpcmpgtb.
bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
(Src.getOperand(0).getValueType() == MVT::v16i8 ||
Src.getOperand(0).getValueType() == MVT::v32i8 ||
- Src.getOperand(0).getValueType() == MVT::v64i8 ||
- Src.getOperand(0).getValueType() == MVT::v4i32 ||
- Src.getOperand(0).getValueType() == MVT::v8i32);
+ Src.getOperand(0).getValueType() == MVT::v64i8);
// Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
// directly with vpmovmskb/vmovmskps/vmovmskpd.
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index e141a7a5421bc..58adbb767ed87 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -648,28 +648,29 @@ declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2
define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
; X64-KNL-LABEL: test15:
; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vpslld $31, %xmm1, %xmm1
+; X64-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; X64-KNL-NEXT: vpmovsxdq %xmm0, %ymm0
; X64-KNL-NEXT: vpsllq $2, %ymm0, %ymm0
-; X64-KNL-NEXT: vmovq %rdi, %xmm2
-; X64-KNL-NEXT: vpbroadcastq %xmm2, %ymm2
-; X64-KNL-NEXT: vpaddq %ymm0, %ymm2, %ymm2
-; X64-KNL-NEXT: vpslld $31, %xmm1, %xmm0
-; X64-KNL-NEXT: vmovmskps %xmm0, %eax
+; X64-KNL-NEXT: vmovq %rdi, %xmm1
+; X64-KNL-NEXT: vpbroadcastq %xmm1, %ymm1
+; X64-KNL-NEXT: vpaddq %ymm0, %ymm1, %ymm1
+; X64-KNL-NEXT: kmovw %k0, %eax
; X64-KNL-NEXT: testb $1, %al
; X64-KNL-NEXT: # implicit-def: $xmm0
; X64-KNL-NEXT: je .LBB14_2
; X64-KNL-NEXT: # %bb.1: # %cond.load
-; X64-KNL-NEXT: vmovq %xmm2, %rcx
+; X64-KNL-NEXT: vmovq %xmm1, %rcx
; X64-KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-KNL-NEXT: .LBB14_2: # %else
; X64-KNL-NEXT: testb $2, %al
; X64-KNL-NEXT: je .LBB14_4
; X64-KNL-NEXT: # %bb.3: # %cond.load1
-; X64-KNL-NEXT: vpextrq $1, %xmm2, %rcx
+; X64-KNL-NEXT: vpextrq $1, %xmm1, %rcx
; X64-KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-KNL-NEXT: .LBB14_4: # %else2
; X64-KNL-NEXT: testb $4, %al
-; X64-KNL-NEXT: vextracti128 $1, %ymm2, %xmm1
+; X64-KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
; X64-KNL-NEXT: jne .LBB14_5
; X64-KNL-NEXT: # %bb.6: # %else5
; X64-KNL-NEXT: testb $8, %al
@@ -690,11 +691,12 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
;
; X86-KNL-LABEL: test15:
; X86-KNL: # %bb.0:
+; X86-KNL-NEXT: vpslld $31, %xmm1, %xmm1
+; X86-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; X86-KNL-NEXT: vpslld $2, %xmm0, %xmm0
-; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
-; X86-KNL-NEXT: vpaddd %xmm0, %xmm2, %xmm2
-; X86-KNL-NEXT: vpslld $31, %xmm1, %xmm0
-; X86-KNL-NEXT: vmovmskps %xmm0, %eax
+; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
+; X86-KNL-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; X86-KNL-NEXT: kmovw %k0, %eax
; X86-KNL-NEXT: testb $1, %al
; X86-KNL-NEXT: # implicit-def: $xmm0
; X86-KNL-NEXT: jne .LBB14_1
@@ -708,25 +710,27 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: jne .LBB14_7
; X86-KNL-NEXT: .LBB14_8: # %else8
+; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
; X86-KNL-NEXT: .LBB14_1: # %cond.load
-; X86-KNL-NEXT: vmovd %xmm2, %ecx
+; X86-KNL-NEXT: vmovd %xmm1, %ecx
; X86-KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-KNL-NEXT: testb $2, %al
; X86-KNL-NEXT: je .LBB14_4
; X86-KNL-NEXT: .LBB14_3: # %cond.load1
-; X86-KNL-NEXT: vpextrd $1, %xmm2, %ecx
+; X86-KNL-NEXT: vpextrd $1, %xmm1, %ecx
; X86-KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-KNL-NEXT: testb $4, %al
; X86-KNL-NEXT: je .LBB14_6
; X86-KNL-NEXT: .LBB14_5: # %cond.load4
-; X86-KNL-NEXT: vpextrd $2, %xmm2, %ecx
+; X86-KNL-NEXT: vpextrd $2, %xmm1, %ecx
; X86-KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: je .LBB14_8
; X86-KNL-NEXT: .LBB14_7: # %cond.load7
-; X86-KNL-NEXT: vpextrd $3, %xmm2, %eax
+; X86-KNL-NEXT: vpextrd $3, %xmm1, %eax
; X86-KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
;
; X64-SKX-LABEL: test15:
@@ -757,26 +761,27 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
define <4 x double> @test16(ptr %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
; X64-KNL-LABEL: test16:
; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vpslld $31, %xmm1, %xmm1
+; X64-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; X64-KNL-NEXT: vpmovsxdq %xmm0, %ymm0
; X64-KNL-NEXT: vpsllq $3, %ymm0, %ymm0
-; X64-KNL-NEXT: vmovq %rdi, %xmm3
-; X64-KNL-NEXT: vpbroadcastq %xmm3, %ymm3
-; X64-KNL-NEXT: vpaddq %ymm0, %ymm3, %ymm0
-; X64-KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; X64-KNL-NEXT: vmovmskps %xmm1, %eax
+; X64-KNL-NEXT: vmovq %rdi, %xmm1
+; X64-KNL-NEXT: vpbroadcastq %xmm1, %ymm1
+; X64-KNL-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; X64-KNL-NEXT: kmovw %k0, %eax
; X64-KNL-NEXT: testb $1, %al
; X64-KNL-NEXT: je .LBB15_2
; X64-KNL-NEXT: # %bb.1: # %cond.load
; X64-KNL-NEXT: vmovq %xmm0, %rcx
-; X64-KNL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X64-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7]
+; X64-KNL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; X64-KNL-NEXT: .LBB15_2: # %else
; X64-KNL-NEXT: testb $2, %al
; X64-KNL-NEXT: je .LBB15_4
; X64-KNL-NEXT: # %bb.3: # %cond.load1
; X64-KNL-NEXT: vpextrq $1, %xmm0, %rcx
; X64-KNL-NEXT: vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1]
-; X64-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; X64-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; X64-KNL-NEXT: .LBB15_4: # %else2
; X64-KNL-NEXT: testb $4, %al
; X64-KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -785,28 +790,29 @@ define <4 x double> @test16(ptr %base, <4 x i32> %ind, <4 x i1> %mask, <4 x doub
; X64-KNL-NEXT: testb $8, %al
; X64-KNL-NEXT: jne .LBB15_7
; X64-KNL-NEXT: .LBB15_8: # %else8
-; X64-KNL-NEXT: vmovaps %ymm2, %ymm0
+; X64-KNL-NEXT: vmovdqa %ymm2, %ymm0
; X64-KNL-NEXT: retq
; X64-KNL-NEXT: .LBB15_5: # %cond.load4
; X64-KNL-NEXT: vmovq %xmm0, %rcx
-; X64-KNL-NEXT: vbroadcastsd (%rcx), %ymm1
-; X64-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
+; X64-KNL-NEXT: vpbroadcastq (%rcx), %ymm1
+; X64-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; X64-KNL-NEXT: testb $8, %al
; X64-KNL-NEXT: je .LBB15_8
; X64-KNL-NEXT: .LBB15_7: # %cond.load7
; X64-KNL-NEXT: vpextrq $1, %xmm0, %rax
-; X64-KNL-NEXT: vbroadcastsd (%rax), %ymm0
-; X64-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; X64-KNL-NEXT: vmovaps %ymm2, %ymm0
+; X64-KNL-NEXT: vpbroadcastq (%rax), %ymm0
+; X64-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; X64-KNL-NEXT: vmovdqa %ymm2, %ymm0
; X64-KNL-NEXT: retq
;
; X86-KNL-LABEL: test16:
; X86-KNL: # %bb.0:
-; X86-KNL-NEXT: vpslld $3, %xmm0, %xmm0
-; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm3
-; X86-KNL-NEXT: vpaddd %xmm0, %xmm3, %xmm0
; X86-KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; X86-KNL-NEXT: vmovmskps %xmm1, %eax
+; X86-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; X86-KNL-NEXT: vpslld $3, %xmm0, %xmm0
+; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
+; X86-KNL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; X86-KNL-NEXT: kmovw %k0, %eax
; X86-KNL-NEXT: testb $1, %al
; X86-KNL-NEXT: jne .LBB15_1
; X86-KNL-NEXT: # %bb.2: # %else
@@ -819,31 +825,31 @@ define <4 x double> @test16(ptr %base, <4 x i32> %ind, <4 x i1> %mask, <4 x doub
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: jne .LBB15_7
; X86-KNL-NEXT: .LBB15_8: # %else8
-; X86-KNL-NEXT: vmovaps %ymm2, %ymm0
+; X86-KNL-NEXT: vmovdqa %ymm2, %ymm0
; X86-KNL-NEXT: retl
; X86-KNL-NEXT: .LBB15_1: # %cond.load
; X86-KNL-NEXT: vmovd %xmm0, %ecx
-; X86-KNL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7]
+; X86-KNL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; X86-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; X86-KNL-NEXT: testb $2, %al
; X86-KNL-NEXT: je .LBB15_4
; X86-KNL-NEXT: .LBB15_3: # %cond.load1
; X86-KNL-NEXT: vpextrd $1, %xmm0, %ecx
; X86-KNL-NEXT: vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1]
-; X86-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; X86-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; X86-KNL-NEXT: testb $4, %al
; X86-KNL-NEXT: je .LBB15_6
; X86-KNL-NEXT: .LBB15_5: # %cond.load4
; X86-KNL-NEXT: vpextrd $2, %xmm0, %ecx
-; X86-KNL-NEXT: vbroadcastsd (%ecx), %ymm1
-; X86-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
+; X86-KNL-NEXT: vpbroadcastq (%ecx), %ymm1
+; X86-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: je .LBB15_8
; X86-KNL-NEXT: .LBB15_7: # %cond.load7
; X86-KNL-NEXT: vpextrd $3, %xmm0, %eax
-; X86-KNL-NEXT: vbroadcastsd (%eax), %ymm0
-; X86-KNL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; X86-KNL-NEXT: vmovaps %ymm2, %ymm0
+; X86-KNL-NEXT: vpbroadcastq (%eax), %ymm0
+; X86-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; X86-KNL-NEXT: vmovdqa %ymm2, %ymm0
; X86-KNL-NEXT: retl
;
; X64-SKX-LABEL: test16:
@@ -998,7 +1004,8 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
; X64-KNL-LABEL: test18:
; X64-KNL: # %bb.0:
; X64-KNL-NEXT: vpslld $31, %xmm2, %xmm2
-; X64-KNL-NEXT: vmovmskps %xmm2, %eax
+; X64-KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; X64-KNL-NEXT: kmovw %k0, %eax
; X64-KNL-NEXT: testb $1, %al
; X64-KNL-NEXT: je .LBB17_2
; X64-KNL-NEXT: # %bb.1: # %cond.store
@@ -1034,7 +1041,8 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
; X86-KNL-LABEL: test18:
; X86-KNL: # %bb.0:
; X86-KNL-NEXT: vpslld $31, %xmm2, %xmm2
-; X86-KNL-NEXT: vmovmskps %xmm2, %eax
+; X86-KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; X86-KNL-NEXT: kmovw %k0, %eax
; X86-KNL-NEXT: testb $1, %al
; X86-KNL-NEXT: jne .LBB17_1
; X86-KNL-NEXT: # %bb.2: # %else
@@ -1047,6 +1055,7 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: jne .LBB17_7
; X86-KNL-NEXT: .LBB17_8: # %else6
+; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
; X86-KNL-NEXT: .LBB17_1: # %cond.store
; X86-KNL-NEXT: vmovd %xmm1, %ecx
@@ -1066,6 +1075,7 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
; X86-KNL-NEXT: .LBB17_7: # %cond.store5
; X86-KNL-NEXT: vpextrd $3, %xmm1, %eax
; X86-KNL-NEXT: vextractps $3, %xmm0, (%eax)
+; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
;
; X64-SKX-LABEL: test18:
@@ -1089,27 +1099,28 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; X64-KNL-LABEL: test19:
; X64-KNL: # %bb.0:
-; X64-KNL-NEXT: vpsllq $3, %ymm2, %ymm2
-; X64-KNL-NEXT: vmovq %rdi, %xmm3
-; X64-KNL-NEXT: vpbroadcastq %xmm3, %ymm3
-; X64-KNL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X64-KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; X64-KNL-NEXT: vmovmskps %xmm1, %eax
+; X64-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; X64-KNL-NEXT: vpsllq $3, %ymm2, %ymm1
+; X64-KNL-NEXT: vmovq %rdi, %xmm2
+; X64-KNL-NEXT: vpbroadcastq %xmm2, %ymm2
+; X64-KNL-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; X64-KNL-NEXT: kmovw %k0, %eax
; X64-KNL-NEXT: testb $1, %al
; X64-KNL-NEXT: je .LBB18_2
; X64-KNL-NEXT: # %bb.1: # %cond.store
-; X64-KNL-NEXT: vmovq %xmm2, %rcx
+; X64-KNL-NEXT: vmovq %xmm1, %rcx
; X64-KNL-NEXT: vmovlps %xmm0, (%rcx)
; X64-KNL-NEXT: .LBB18_2: # %else
; X64-KNL-NEXT: testb $2, %al
; X64-KNL-NEXT: je .LBB18_4
; X64-KNL-NEXT: # %bb.3: # %cond.store1
-; X64-KNL-NEXT: vpextrq $1, %xmm2, %rcx
+; X64-KNL-NEXT: vpextrq $1, %xmm1, %rcx
; X64-KNL-NEXT: vmovhps %xmm0, (%rcx)
; X64-KNL-NEXT: .LBB18_4: # %else2
; X64-KNL-NEXT: testb $4, %al
; X64-KNL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X64-KNL-NEXT: vextracti128 $1, %ymm2, %xmm1
+; X64-KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
; X64-KNL-NEXT: jne .LBB18_5
; X64-KNL-NEXT: # %bb.6: # %else4
; X64-KNL-NEXT: testb $8, %al
@@ -1131,22 +1142,23 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; X86-KNL-LABEL: test19:
; X86-KNL: # %bb.0:
; X86-KNL-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
-; X86-KNL-NEXT: vpmovqd %zmm2, %ymm2
-; X86-KNL-NEXT: vpslld $3, %xmm2, %xmm2
-; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm3
-; X86-KNL-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; X86-KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; X86-KNL-NEXT: vmovmskps %xmm1, %eax
+; X86-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; X86-KNL-NEXT: vpmovqd %zmm2, %ymm1
+; X86-KNL-NEXT: vpslld $3, %xmm1, %xmm1
+; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
+; X86-KNL-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; X86-KNL-NEXT: kmovw %k0, %eax
; X86-KNL-NEXT: testb $1, %al
; X86-KNL-NEXT: je .LBB18_2
; X86-KNL-NEXT: # %bb.1: # %cond.store
-; X86-KNL-NEXT: vmovd %xmm2, %ecx
+; X86-KNL-NEXT: vmovd %xmm1, %ecx
; X86-KNL-NEXT: vmovlps %xmm0, (%ecx)
; X86-KNL-NEXT: .LBB18_2: # %else
; X86-KNL-NEXT: testb $2, %al
; X86-KNL-NEXT: je .LBB18_4
; X86-KNL-NEXT: # %bb.3: # %cond.store1
-; X86-KNL-NEXT: vpextrd $1, %xmm2, %ecx
+; X86-KNL-NEXT: vpextrd $1, %xmm1, %ecx
; X86-KNL-NEXT: vmovhps %xmm0, (%ecx)
; X86-KNL-NEXT: .LBB18_4: # %else2
; X86-KNL-NEXT: testb $4, %al
@@ -1159,12 +1171,12 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
; X86-KNL-NEXT: .LBB18_5: # %cond.store3
-; X86-KNL-NEXT: vpextrd $2, %xmm2, %ecx
+; X86-KNL-NEXT: vpextrd $2, %xmm1, %ecx
; X86-KNL-NEXT: vmovlps %xmm0, (%ecx)
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: je .LBB18_8
; X86-KNL-NEXT: .LBB18_7: # %cond.store5
-; X86-KNL-NEXT: vpextrd $3, %xmm2, %eax
+; X86-KNL-NEXT: vpextrd $3, %xmm1, %eax
; X86-KNL-NEXT: vmovhps %xmm0, (%eax)
; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
@@ -4414,24 +4426,25 @@ define void @scatter_16i64_constant_indices(ptr %ptr, <16 x i1> %mask, <16 x i32
define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru) {
; X64-KNL-LABEL: splat_ptr_gather:
; X64-KNL: # %bb.0:
-; X64-KNL-NEXT: vmovq %rdi, %xmm2
-; X64-KNL-NEXT: vpbroadcastq %xmm2, %ymm2
; X64-KNL-NEXT: vpslld $31, %xmm0, %xmm0
-; X64-KNL-NEXT: vmovmskps %xmm0, %eax
+; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; X64-KNL-NEXT: vmovq %rdi, %xmm0
+; X64-KNL-NEXT: vpbroadcastq %xmm0, %ymm0
+; X64-KNL-NEXT: kmovw %k0, %eax
; X64-KNL-NEXT: testb $1, %al
; X64-KNL-NEXT: je .LBB62_2
; X64-KNL-NEXT: # %bb.1: # %cond.load
-; X64-KNL-NEXT: vmovq %xmm2, %rcx
+; X64-KNL-NEXT: vmovq %xmm0, %rcx
; X64-KNL-NEXT: vpinsrd $0, (%rcx), %xmm1, %xmm1
; X64-KNL-NEXT: .LBB62_2: # %else
; X64-KNL-NEXT: testb $2, %al
; X64-KNL-NEXT: je .LBB62_4
; X64-KNL-NEXT: # %bb.3: # %cond.load1
-; X64-KNL-NEXT: vpextrq $1, %xmm2, %rcx
+; X64-KNL-NEXT: vpextrq $1, %xmm0, %rcx
; X64-KNL-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm1
; X64-KNL-NEXT: .LBB62_4: # %else2
; X64-KNL-NEXT: testb $4, %al
-; X64-KNL-NEXT: vextracti128 $1, %ymm2, %xmm0
+; X64-KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; X64-KNL-NEXT: jne .LBB62_5
; X64-KNL-NEXT: # %bb.6: # %else5
; X64-KNL-NEXT: testb $8, %al
@@ -4454,9 +4467,10 @@ define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru
;
; X86-KNL-LABEL: splat_ptr_gather:
; X86-KNL: # %bb.0:
-; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
; X86-KNL-NEXT: vpslld $31, %xmm0, %xmm0
-; X86-KNL-NEXT: vmovmskps %xmm0, %eax
+; X86-KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
+; X86-KNL-NEXT: kmovw %k0, %eax
; X86-KNL-NEXT: testb $1, %al
; X86-KNL-NEXT: jne .LBB62_1
; X86-KNL-NEXT: # %bb.2: # %else
@@ -4470,26 +4484,28 @@ define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru
; X86-KNL-NEXT: jne .LBB62_7
; X86-KNL-NEXT: .LBB62_8: # %else8
; X86-KNL-NEXT: vmovdqa %xmm1, %xmm0
+; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
; X86-KNL-NEXT: .LBB62_1: # %cond.load
-; X86-KNL-NEXT: vmovd %xmm2, %ecx
+; X86-KNL-NEXT: vmovd %xmm0, %ecx
; X86-KNL-NEXT: vpinsrd $0, (%ecx), %xmm1, %xmm1
; X86-KNL-NEXT: testb $2, %al
; X86-KNL-NEXT: je .LBB62_4
; X86-KNL-NEXT: .LBB62_3: # %cond.load1
-; X86-KNL-NEXT: vpextrd $1, %xmm2, %ecx
+; X86-KNL-NEXT: vpextrd $1, %xmm0, %ecx
; X86-KNL-NEXT: vpinsrd $1, (%ecx), %xmm1, %xmm1
; X86-KNL-NEXT: testb $4, %al
; X86-KNL-NEXT: je .LBB62_6
; X86-KNL-NEXT: .LBB62_5: # %cond.load4
-; X86-KNL-NEXT: vpextrd $2, %xmm2, %ecx
+; X86-KNL-NEXT: vpextrd $2, %xmm0, %ecx
; X86-KNL-NEXT: vpinsrd $2, (%ecx), %xmm1, %xmm1
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: je .LBB62_8
; X86-KNL-NEXT: .LBB62_7: # %cond.load7
-; X86-KNL-NEXT: vpextrd $3, %xmm2, %eax
+; X86-KNL-NEXT: vpextrd $3, %xmm0, %eax
; X86-KNL-NEXT: vpinsrd $3, (%eax), %xmm1, %xmm1
; X86-KNL-NEXT: vmovdqa %xmm1, %xmm0
+; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
;
; X64-SKX-LABEL: splat_ptr_gather:
@@ -4520,24 +4536,25 @@ declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x
define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
; X64-KNL-LABEL: splat_ptr_scatter:
; X64-KNL: # %bb.0:
-; X64-KNL-NEXT: vmovq %rdi, %xmm2
-; X64-KNL-NEXT: vpbroadcastq %xmm2, %ymm2
; X64-KNL-NEXT: vpslld $31, %xmm0, %xmm0
-; X64-KNL-NEXT: vmovmskps %xmm0, %eax
+; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; X64-KNL-NEXT: vmovq %rdi, %xmm0
+; X64-KNL-NEXT: vpbroadcastq %xmm0, %ymm0
+; X64-KNL-NEXT: kmovw %k0, %eax
; X64-KNL-NEXT: testb $1, %al
; X64-KNL-NEXT: je .LBB63_2
; X64-KNL-NEXT: # %bb.1: # %cond.store
-; X64-KNL-NEXT: vmovq %xmm2, %rcx
+; X64-KNL-NEXT: vmovq %xmm0, %rcx
; X64-KNL-NEXT: vmovss %xmm1, (%rcx)
; X64-KNL-NEXT: .LBB63_2: # %else
; X64-KNL-NEXT: testb $2, %al
; X64-KNL-NEXT: je .LBB63_4
; X64-KNL-NEXT: # %bb.3: # %cond.store1
-; X64-KNL-NEXT: vpextrq $1, %xmm2, %rcx
+; X64-KNL-NEXT: vpextrq $1, %xmm0, %rcx
; X64-KNL-NEXT: vextractps $1, %xmm1, (%rcx)
; X64-KNL-NEXT: .LBB63_4: # %else2
; X64-KNL-NEXT: testb $4, %al
-; X64-KNL-NEXT: vextracti128 $1, %ymm2, %xmm0
+; X64-KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; X64-KNL-NEXT: jne .LBB63_5
; X64-KNL-NEXT: # %bb.6: # %else4
; X64-KNL-NEXT: testb $8, %al
@@ -4558,9 +4575,10 @@ define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
;
; X86-KNL-LABEL: splat_ptr_scatter:
; X86-KNL: # %bb.0:
-; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
; X86-KNL-NEXT: vpslld $31, %xmm0, %xmm0
-; X86-KNL-NEXT: vmovmskps %xmm0, %eax
+; X86-KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
+; X86-KNL-NEXT: kmovw %k0, %eax
; X86-KNL-NEXT: testb $1, %al
; X86-KNL-NEXT: jne .LBB63_1
; X86-KNL-NEXT: # %bb.2: # %else
@@ -4573,25 +4591,27 @@ define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: jne .LBB63_7
; X86-KNL-NEXT: .LBB63_8: # %else6
+; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
; X86-KNL-NEXT: .LBB63_1: # %cond.store
-; X86-KNL-NEXT: vmovd %xmm2, %ecx
+; X86-KNL-NEXT: vmovd %xmm0, %ecx
; X86-KNL-NEXT: vmovss %xmm1, (%ecx)
; X86-KNL-NEXT: testb $2, %al
; X86-KNL-NEXT: je .LBB63_4
; X86-KNL-NEXT: .LBB63_3: # %cond.store1
-; X86-KNL-NEXT: vpextrd $1, %xmm2, %ecx
+; X86-KNL-NEXT: vpextrd $1, %xmm0, %ecx
; X86-KNL-NEXT: vextractps $1, %xmm1, (%ecx)
; X86-KNL-NEXT: testb $4, %al
; X86-KNL-NEXT: je .LBB63_6
; X86-KNL-NEXT: .LBB63_5: # %cond.store3
-; X86-KNL-NEXT: vpextrd $2, %xmm2, %ecx
+; X86-KNL-NEXT: vpextrd $2, %xmm0, %ecx
; X86-KNL-NEXT: vextractps $2, %xmm1, (%ecx)
; X86-KNL-NEXT: testb $8, %al
; X86-KNL-NEXT: je .LBB63_8
; X86-KNL-NEXT: .LBB63_7: # %cond.store5
-; X86-KNL-NEXT: vpextrd $3, %xmm2, %eax
+; X86-KNL-NEXT: vpextrd $3, %xmm0, %eax
; X86-KNL-NEXT: vextractps $3, %xmm1, (%eax)
+; X86-KNL-NEXT: vzeroupper
; X86-KNL-NEXT: retl
;
; X64-SKX-LABEL: splat_ptr_scatter:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 0dce8e204f169..6cb43234d713b 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -79,13 +79,34 @@ define i1 @trunc_v4i32_v4i1(<4 x i32>) nounwind {
; AVX-NEXT: setnp %al
; AVX-NEXT: retq
;
-; AVX512-LABEL: trunc_v4i32_v4i1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vmovmskps %xmm0, %eax
-; AVX512-NEXT: testb %al, %al
-; AVX512-NEXT: setnp %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: trunc_v4i32_v4i1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb $15, %al
+; AVX512F-NEXT: setnp %al
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_v4i32_v4i1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: testb $15, %al
+; AVX512BW-NEXT: setnp %al
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: trunc_v4i32_v4i1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512VL-NEXT: kmovd %k0, %eax
+; AVX512VL-NEXT: testb %al, %al
+; AVX512VL-NEXT: setnp %al
+; AVX512VL-NEXT: retq
%a = trunc <4 x i32> %0 to <4 x i1>
%b = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a)
ret i1 %b
@@ -286,14 +307,35 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: trunc_v8i32_v8i1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $31, %ymm0, %ymm0
-; AVX512-NEXT: vmovmskps %ymm0, %eax
-; AVX512-NEXT: testb %al, %al
-; AVX512-NEXT: setnp %al
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: trunc_v8i32_v8i1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: setnp %al
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_v8i32_v8i1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: testb %al, %al
+; AVX512BW-NEXT: setnp %al
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: trunc_v8i32_v8i1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: kmovd %k0, %eax
+; AVX512VL-NEXT: testb %al, %al
+; AVX512VL-NEXT: setnp %al
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
%a = trunc <8 x i32> %0 to <8 x i1>
%b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a)
ret i1 %b
>From 104a04811e49faadbfd4a448911f6ac673de877b Mon Sep 17 00:00:00 2001
From: Qihan Cai <caiqihan021 at hotmail.com>
Date: Fri, 19 Dec 2025 18:22:57 +1100
Subject: [PATCH 03/10] [X86] Compress VPMOV*2M+KMOV chain to VMOVMSK on
AVX-512
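A minimal IR reproducer, lifted from the avx512dqvl-intrinsics-upgrade.ll test
updated below (function name shortened here for illustration):

  ; Sketch only; compile with llc -mtriple=x86_64 -mattr=+avx512dq,+avx512vl
  declare i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32>)

  define i8 @cvtd2mask_128(<4 x i32> %x0) {
    %res = call i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32> %x0)
    ret i8 %res
  }

This used to compile to vpmovd2m %xmm0, %k0 followed by kmovw %k0, %eax; the
new X86CompressEVEX handling rewrites the pair into a single
vmovmskps %xmm0, %eax when a single kmov reads the mask register in the block.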
---
llvm/lib/Target/X86/X86CompressEVEX.cpp | 65 +++++++++++++++++--
llvm/test/CodeGen/X86/avx512-ext.ll | 3 +-
.../CodeGen/X86/avx512-mask-zext-bugfix.ll | 3 +-
.../X86/avx512dqvl-intrinsics-upgrade.ll | 3 +-
llvm/test/CodeGen/X86/masked_compressstore.ll | 3 +-
llvm/test/CodeGen/X86/masked_expandload.ll | 3 +-
llvm/test/CodeGen/X86/masked_load.ll | 3 +-
llvm/test/CodeGen/X86/masked_store.ll | 3 +-
llvm/test/CodeGen/X86/pr33349.ll | 3 +-
llvm/test/CodeGen/X86/pr77459.ll | 3 +-
llvm/test/CodeGen/X86/vector-shuffle-v1.ll | 18 ++---
11 files changed, 75 insertions(+), 35 deletions(-)
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index 0f55c19c69120..088f00c87a6a6 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -41,6 +41,7 @@
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -176,7 +177,8 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
}
static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
- const X86Subtarget &ST) {
+ const X86Subtarget &ST,
+ SmallVectorImpl<MachineInstr *> &ToErase) {
uint64_t TSFlags = MI.getDesc().TSFlags;
// Check for EVEX instructions only.
@@ -187,6 +189,54 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2))
return false;
+ // Try to compress VPMOV*2M + KMOV chain patterns:
+ // vpmovd2m %xmm0, %k0 -> vmovmskps %xmm0, %eax
+ // kmovb %k0, %eax (erase this)
+ unsigned Opc = MI.getOpcode();
+ if ((Opc == X86::VPMOVD2MZ128kr || Opc == X86::VPMOVD2MZ256kr) &&
+ !usesExtendedRegister(MI) && MI.getOperand(0).isReg()) {
+
+ Register MaskReg = MI.getOperand(0).getReg();
+ Register SrcVecReg = MI.getOperand(1).getReg();
+
+ // Find the unique KMOV instruction that reads this mask register
+ MachineInstr *KMovMI = nullptr;
+ Register GPRReg;
+ for (MachineInstr &UseMI : MBB) {
+ if (&UseMI == &MI)
+ continue;
+
+ unsigned UseOpc = UseMI.getOpcode();
+ if ((UseOpc == X86::KMOVBrk || UseOpc == X86::KMOVWrk ||
+ UseOpc == X86::KMOVDrk || UseOpc == X86::KMOVQrk) &&
+ UseMI.getOperand(1).isReg() &&
+ UseMI.getOperand(1).getReg() == MaskReg) {
+
+ if (KMovMI)
+ break; // Multiple uses, can't compress
+
+ KMovMI = &UseMI;
+ GPRReg = UseMI.getOperand(0).getReg();
+ }
+ }
+ if (KMovMI) {
+ unsigned MovMskOpc = (Opc == X86::VPMOVD2MZ128kr)
+ ? X86::VMOVMSKPSrr
+ : X86::VMOVMSKPSYrr;
+
+ const MCInstrDesc &MovMskDesc = ST.getInstrInfo()->get(MovMskOpc);
+ MI.setDesc(MovMskDesc);
+ MI.getOperand(0).setReg(GPRReg);
+ MI.getOperand(1).setReg(SrcVecReg);
+ MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
+
+ // Record KMOV for deletion
+ ToErase.push_back(KMovMI);
+
+ return true;
+ }
+ }
+
auto IsRedundantNewDataDest = [&](unsigned &Opc) {
// $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx
// ->
@@ -222,7 +272,6 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
// For AVX512 cases, EVEX prefix is needed in order to carry this information
// thus preventing the transformation to VEX encoding.
bool IsND = X86II::hasNewDataDest(TSFlags);
- unsigned Opc = MI.getOpcode();
bool IsSetZUCCm = Opc == X86::SETZUCCm;
if (TSFlags & X86II::EVEX_B && !IsND && !IsSetZUCCm)
return false;
@@ -347,9 +396,15 @@ bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
- // Traverse the basic block.
- for (MachineInstr &MI : llvm::make_early_inc_range(MBB))
- Changed |= CompressEVEXImpl(MI, MBB, ST);
+ SmallVector<MachineInstr *, 4> ToErase;
+
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+ Changed |= CompressEVEXImpl(MI, MBB, ST, ToErase);
+ }
+
+ for (MachineInstr *MI : ToErase) {
+ MI->eraseFromParent();
+ }
}
LLVM_DEBUG(dbgs() << "End X86CompressEVEXPass\n";);
return Changed;
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index 1a712ffac5b7e..2617e2d12adfd 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -1745,8 +1745,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
; AVX512DQNOBW: # %bb.0:
; AVX512DQNOBW-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512DQNOBW-NEXT: vpslld $31, %ymm0, %ymm0
-; AVX512DQNOBW-NEXT: vpmovd2m %ymm0, %k0
-; AVX512DQNOBW-NEXT: kmovw %k0, %eax
+; AVX512DQNOBW-NEXT: vmovmskps %ymm0, %eax
; AVX512DQNOBW-NEXT: # kill: def $al killed $al killed $eax
; AVX512DQNOBW-NEXT: vzeroupper
; AVX512DQNOBW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll b/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
index 2412e7aefdc2f..8febc3df65c0d 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
@@ -34,11 +34,10 @@ define void @test_xmm(i32 %shift, i32 %mulp, <2 x i64> %a,ptr %arraydecay,ptr %f
; CHECK-NEXT: ## kill: def $eax killed $eax killed $rax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT: movzwl %ax, %esi
-; CHECK-NEXT: kmovb %k0, %edi
; CHECK-NEXT: callq _check_mask16
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; CHECK-NEXT: vpmovd2m %xmm0, %k0
+; CHECK-NEXT: vmovmskps %xmm0, %edi
; CHECK-NEXT: ## kill: def $k1 killed $k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
index f31dafcd68626..a5e08a683e322 100644
--- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -2751,8 +2751,7 @@ declare i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32>)
define i8@test_int_x86_avx512_cvtd2mask_128(<4 x i32> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_128:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovd2m %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x39,0xc0]
-; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vmovmskps %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x50,0xc0]
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32> %x0)
diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll
index 3187bf6448690..5296c9d0f0777 100644
--- a/llvm/test/CodeGen/X86/masked_compressstore.ll
+++ b/llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -3444,8 +3444,7 @@ define void @compressstore_v8i16_v8i16(ptr %base, <8 x i16> %V, <8 x i16> %trigg
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vpmovsxwd %xmm1, %ymm1
-; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
+; AVX512VLDQ-NEXT: vmovmskps %ymm1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: jne LBB11_1
; AVX512VLDQ-NEXT: ## %bb.2: ## %else
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index e81a983c07018..ce8a34db498df 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -3047,8 +3047,7 @@ define <8 x i16> @expandload_v8i16_v8i16(ptr %base, <8 x i16> %src0, <8 x i16> %
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vpmovsxwd %xmm1, %ymm1
-; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
+; AVX512VLDQ-NEXT: vmovmskps %ymm1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: jne LBB11_1
; AVX512VLDQ-NEXT: ## %bb.2: ## %else
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 8c4bab99a5b7b..fa8f34cea4638 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -3008,8 +3008,7 @@ define <8 x i16> @load_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %dst
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0
; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
+; AVX512VLDQ-NEXT: vmovmskps %ymm0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: jne LBB21_1
; AVX512VLDQ-NEXT: ## %bb.2: ## %else
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index c7320275091c6..fbecfcb45f8e7 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -1829,8 +1829,7 @@ define void @store_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %val) no
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k0
-; AVX512VLDQ-NEXT: kmovw %k0, %eax
+; AVX512VLDQ-NEXT: vmovmskps %ymm0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: jne LBB13_1
; AVX512VLDQ-NEXT: ## %bb.2: ## %else
diff --git a/llvm/test/CodeGen/X86/pr33349.ll b/llvm/test/CodeGen/X86/pr33349.ll
index c879cb9867ab2..35c09ca3d460a 100644
--- a/llvm/test/CodeGen/X86/pr33349.ll
+++ b/llvm/test/CodeGen/X86/pr33349.ll
@@ -41,7 +41,7 @@ target triple = "x86_64-unknown-linux-gnu"
; SKX-LABEL: test:
; SKX: # %bb.0: # %bb
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k0
+; SKX-NEXT: vmovmskps %xmm0, %eax
; SKX-NEXT: kshiftrb $2, %k0, %k1
; SKX-NEXT: kmovd %k1, %eax
; SKX-NEXT: testb $1, %al
@@ -49,7 +49,6 @@ target triple = "x86_64-unknown-linux-gnu"
; SKX-NEXT: fldz
; SKX-NEXT: fld %st(0)
; SKX-NEXT: fcmovne %st(2), %st
-; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: fld %st(1)
; SKX-NEXT: fcmovne %st(3), %st
diff --git a/llvm/test/CodeGen/X86/pr77459.ll b/llvm/test/CodeGen/X86/pr77459.ll
index 9c072e6f5e3fc..02311a01d675e 100644
--- a/llvm/test/CodeGen/X86/pr77459.ll
+++ b/llvm/test/CodeGen/X86/pr77459.ll
@@ -100,8 +100,7 @@ define i8 @reverse_cmp_v8i1(<8 x i16> %a0, <8 x i16> %a1) {
; AVX512-NEXT: vpmovm2d %k0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vpmovd2m %ymm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: vmovmskps %ymm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
index 2b89590a0bb41..9645f7c524cb4 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -573,8 +573,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
-; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
-; VL_BW_DQ-NEXT: kmovd %k0, %eax
+; VL_BW_DQ-NEXT: vmovmskps %ymm2, %eax
; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
@@ -615,8 +614,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; VL_BW_DQ-NEXT: kmovd %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
-; VL_BW_DQ-NEXT: kmovd %k0, %eax
+; VL_BW_DQ-NEXT: vmovmskps %ymm0, %eax
; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
@@ -661,8 +659,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
-; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
-; VL_BW_DQ-NEXT: kmovd %k0, %eax
+; VL_BW_DQ-NEXT: vmovmskps %ymm2, %eax
; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
@@ -703,8 +700,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; VL_BW_DQ-NEXT: kmovd %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
-; VL_BW_DQ-NEXT: kmovd %k0, %eax
+; VL_BW_DQ-NEXT: vmovmskps %ymm0, %eax
; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
@@ -746,8 +742,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
-; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
-; VL_BW_DQ-NEXT: kmovd %k0, %eax
+; VL_BW_DQ-NEXT: vmovmskps %ymm0, %eax
; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
@@ -796,8 +791,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; VL_BW_DQ-NEXT: vpermt2d %ymm0, %ymm1, %ymm2
-; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
-; VL_BW_DQ-NEXT: kmovd %k0, %eax
+; VL_BW_DQ-NEXT: vmovmskps %ymm2, %eax
; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
>From e4d4259beae00bc1bd03a5b5876eeed0b4c012e7 Mon Sep 17 00:00:00 2001
From: Qihan Cai <caiqihan021 at hotmail.com>
Date: Fri, 19 Dec 2025 18:28:19 +1100
Subject: [PATCH 04/10] clang-format
---
llvm/lib/Target/X86/X86CompressEVEX.cpp | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index 088f00c87a6a6..886c2026ab624 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -220,9 +220,8 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
}
}
if (KMovMI) {
- unsigned MovMskOpc = (Opc == X86::VPMOVD2MZ128kr)
- ? X86::VMOVMSKPSrr
- : X86::VMOVMSKPSYrr;
+ unsigned MovMskOpc =
+ (Opc == X86::VPMOVD2MZ128kr) ? X86::VMOVMSKPSrr : X86::VMOVMSKPSYrr;
const MCInstrDesc &MovMskDesc = ST.getInstrInfo()->get(MovMskOpc);
MI.setDesc(MovMskDesc);
>From 1865989cdf630ba1fda051e144d1f17d360054f4 Mon Sep 17 00:00:00 2001
From: Qihan Cai <caiqihan021 at hotmail.com>
Date: Tue, 23 Dec 2025 22:37:16 +1100
Subject: [PATCH 05/10] fixup miscompilation and organize code into separate
function
---
llvm/lib/Target/X86/X86CompressEVEX.cpp | 120 +++++++++++-------
.../CodeGen/X86/avx512-mask-zext-bugfix.ll | 3 +-
llvm/test/CodeGen/X86/pr33349.ll | 3 +-
3 files changed, 76 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index 886c2026ab624..a5ec40b125888 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -176,6 +176,72 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
return true;
}
+// Try to compress VPMOV*2M + KMOV chain patterns:
+// vpmovd2m %xmm0, %k0 -> (erase this)
+// kmovb %k0, %eax -> vmovmskps %xmm0, %eax
+static bool tryCompressMultiOpPattern(MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ const X86Subtarget &ST,
+ SmallVectorImpl<MachineInstr *> &
+ ToErase) {
+ const X86InstrInfo *TII = ST.getInstrInfo();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+
+ unsigned Opc = MI.getOpcode();
+ if (Opc != X86::VPMOVD2MZ128kr && Opc != X86::VPMOVD2MZ256kr)
+ return false;
+
+ Register MaskReg = MI.getOperand(0).getReg();
+ Register SrcVecReg = MI.getOperand(1).getReg();
+
+ unsigned MovMskOpc = (Opc == X86::VPMOVD2MZ128kr)
+ ? X86::VMOVMSKPSrr
+ : X86::VMOVMSKPSYrr;
+
+ MachineInstr *KMovMI = nullptr;
+
+ for (MachineInstr &CurMI : llvm::make_range(
+ std::next(MachineBasicBlock::iterator(MI)), MBB.end())) {
+ if (CurMI.modifiesRegister(MaskReg, TRI)) {
+ if (!KMovMI)
+ return false; // Mask clobbered before use
+ break;
+ }
+
+ if (CurMI.readsRegister(MaskReg, TRI)) {
+ if (KMovMI)
+ return false; // Fail: Mask has MULTIPLE uses
+
+ unsigned UseOpc = CurMI.getOpcode();
+ bool IsKMOV = (UseOpc == X86::KMOVBrk || UseOpc == X86::KMOVWrk ||
+ UseOpc == X86::KMOVDrk || UseOpc == X86::KMOVQrk);
+
+ if (IsKMOV && CurMI.getOperand(1).getReg() == MaskReg) {
+ KMovMI = &CurMI;
+ // continue scanning to ensure
+ // there are no *other* uses of the mask later in the block.
+ } else {
+ return false;
+ }
+ }
+
+ if (!KMovMI && CurMI.modifiesRegister(SrcVecReg, TRI)) {
+ return false; // SrcVecReg modified before it could be used by MOVMSK
+ }
+ }
+
+ if (!KMovMI)
+ return false;
+
+ // Apply the transformation
+ KMovMI->setDesc(TII->get(MovMskOpc));
+ KMovMI->getOperand(1).setReg(SrcVecReg);
+ KMovMI->setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
+
+ ToErase.push_back(&MI);
+ return true;
+}
+
static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
const X86Subtarget &ST,
SmallVectorImpl<MachineInstr *> &ToErase) {
@@ -189,54 +255,12 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2))
return false;
- // Try to compress VPMOV*2M + KMOV chain patterns:
- // vpmovd2m %xmm0, %k0 -> vmovmskps %xmm0, %eax
- // kmovb %k0, %eax (erase this)
- unsigned Opc = MI.getOpcode();
- if ((Opc == X86::VPMOVD2MZ128kr || Opc == X86::VPMOVD2MZ256kr) &&
- !usesExtendedRegister(MI) && MI.getOperand(0).isReg()) {
-
- Register MaskReg = MI.getOperand(0).getReg();
- Register SrcVecReg = MI.getOperand(1).getReg();
-
- // Find the unique KMOV instruction that reads this mask register
- MachineInstr *KMovMI = nullptr;
- Register GPRReg;
- for (MachineInstr &UseMI : MBB) {
- if (&UseMI == &MI)
- continue;
-
- unsigned UseOpc = UseMI.getOpcode();
- if ((UseOpc == X86::KMOVBrk || UseOpc == X86::KMOVWrk ||
- UseOpc == X86::KMOVDrk || UseOpc == X86::KMOVQrk) &&
- UseMI.getOperand(1).isReg() &&
- UseMI.getOperand(1).getReg() == MaskReg) {
-
- if (KMovMI)
- break; // Multiple uses, can't compress
-
- KMovMI = &UseMI;
- GPRReg = UseMI.getOperand(0).getReg();
- }
- }
- if (KMovMI) {
- unsigned MovMskOpc =
- (Opc == X86::VPMOVD2MZ128kr) ? X86::VMOVMSKPSrr : X86::VMOVMSKPSYrr;
-
- const MCInstrDesc &MovMskDesc = ST.getInstrInfo()->get(MovMskOpc);
- MI.setDesc(MovMskDesc);
- MI.getOperand(0).setReg(GPRReg);
- MI.getOperand(1).setReg(SrcVecReg);
- MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
-
- // Record KMOV for deletion
- ToErase.push_back(KMovMI);
-
- return true;
- }
- }
+ // Specialized VPMOVD2M + KMOV -> MOVMSK fold first.
+ if (tryCompressMultiOpPattern(MI, MBB, ST, ToErase))
+ return true;
- auto IsRedundantNewDataDest = [&](unsigned &Opc) {
+ unsigned Opc = MI.getOpcode();
+ auto IsRedundantNewDataDest = [&](unsigned &OpcRef) {
// $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx
// ->
// $rbx = ADD64rr $rbx, $rax
@@ -256,7 +280,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
return false;
// Opcode may change after commute, e.g. SHRD -> SHLD
ST.getInstrInfo()->commuteInstruction(MI, false, 1, 2);
- Opc = MI.getOpcode();
+ OpcRef = MI.getOpcode();
return true;
};
diff --git a/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll b/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
index 8febc3df65c0d..2412e7aefdc2f 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
@@ -34,10 +34,11 @@ define void @test_xmm(i32 %shift, i32 %mulp, <2 x i64> %a,ptr %arraydecay,ptr %f
; CHECK-NEXT: ## kill: def $eax killed $eax killed $rax
; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
; CHECK-NEXT: movzwl %ax, %esi
+; CHECK-NEXT: kmovb %k0, %edi
; CHECK-NEXT: callq _check_mask16
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; CHECK-NEXT: vmovmskps %xmm0, %edi
+; CHECK-NEXT: vpmovd2m %xmm0, %k0
; CHECK-NEXT: ## kill: def $k1 killed $k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/pr33349.ll b/llvm/test/CodeGen/X86/pr33349.ll
index 35c09ca3d460a..c879cb9867ab2 100644
--- a/llvm/test/CodeGen/X86/pr33349.ll
+++ b/llvm/test/CodeGen/X86/pr33349.ll
@@ -41,7 +41,7 @@ target triple = "x86_64-unknown-linux-gnu"
; SKX-LABEL: test:
; SKX: # %bb.0: # %bb
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vmovmskps %xmm0, %eax
+; SKX-NEXT: vpmovd2m %xmm0, %k0
; SKX-NEXT: kshiftrb $2, %k0, %k1
; SKX-NEXT: kmovd %k1, %eax
; SKX-NEXT: testb $1, %al
@@ -49,6 +49,7 @@ target triple = "x86_64-unknown-linux-gnu"
; SKX-NEXT: fldz
; SKX-NEXT: fld %st(0)
; SKX-NEXT: fcmovne %st(2), %st
+; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: fld %st(1)
; SKX-NEXT: fcmovne %st(3), %st
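To make the fold in this commit concrete, here is a minimal standalone IR sketch (illustrative only, not taken from the patch; the function name is made up). Assuming an AVX512DQ+VL target such as SKX, the truncate-plus-bitcast below is expected to select vpslld $31 + vpmovd2m %ymm0, %k0 + kmovd %k0, %eax today, and after this series it should instead end in vmovmskps %ymm0, %eax (whether via the earlier DAG combine or via this MIR-level compression):
define i8 @sign_mask_v8i32(<8 x i32> %v) {
  %t = trunc <8 x i32> %v to <8 x i1>   ; lane sign bits -> k-register (vpslld + vpmovd2m)
  %m = bitcast <8 x i1> %t to i8        ; k-register -> GPR (kmov)
  ret i8 %m
}
The scan added in this commit deliberately bails out when the mask register has any reader other than the single KMOV, which appears to be why pr33349.ll (where kshiftrb also reads %k0) and avx512-mask-zext-bugfix.ll revert to the k-register form here.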
>From d40f4a7b647ffc38ddc379e01146d9271d907be9 Mon Sep 17 00:00:00 2001
From: Qihan Cai <caiqihan021 at hotmail.com>
Date: Tue, 23 Dec 2025 23:06:28 +1100
Subject: [PATCH 06/10] add support for vmovmskpd
---
llvm/lib/Target/X86/X86CompressEVEX.cpp | 38 +++++++++++++------
.../X86/avx512dqvl-intrinsics-upgrade.ll | 6 +--
.../test/CodeGen/X86/masked_gather_scatter.ll | 33 ++++++----------
.../X86/masked_gather_scatter_widen.ll | 6 +--
4 files changed, 41 insertions(+), 42 deletions(-)
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index a5ec40b125888..379987d162611 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -16,6 +16,7 @@
// d. NF_ND (EVEX) -> NF (EVEX)
// e. NonNF (EVEX) -> NF (EVEX)
// f. SETZUCCm (EVEX) -> SETCCm (legacy)
+// g. VPMOV*2M + KMOV (EVEX) -> VMOVMSK (VEX)
//
// Compression a, b and c can always reduce code size, with some exceptions
// such as promoted 16-bit CRC32 which is as long as the legacy version.
@@ -177,26 +178,39 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
}
// Try to compress VPMOV*2M + KMOV chain patterns:
-// vpmovd2m %xmm0, %k0 -> (erase this)
-// kmovb %k0, %eax -> vmovmskps %xmm0, %eax
-static bool tryCompressMultiOpPattern(MachineInstr &MI,
- MachineBasicBlock &MBB,
- const X86Subtarget &ST,
- SmallVectorImpl<MachineInstr *> &
- ToErase) {
+// vpmov*2m %xmm0, %k0 -> (erase this)
+// kmov* %k0, %eax -> vmovmskp* %xmm0, %eax
+static bool tryCompressVPMOVPattern(MachineInstr &MI, MachineBasicBlock &MBB,
+ const X86Subtarget &ST,
+ SmallVectorImpl<MachineInstr *> &ToErase) {
const X86InstrInfo *TII = ST.getInstrInfo();
const TargetRegisterInfo *TRI = ST.getRegisterInfo();
unsigned Opc = MI.getOpcode();
- if (Opc != X86::VPMOVD2MZ128kr && Opc != X86::VPMOVD2MZ256kr)
+ if (Opc != X86::VPMOVD2MZ128kr && Opc != X86::VPMOVD2MZ256kr &&
+ Opc != X86::VPMOVQ2MZ128kr && Opc != X86::VPMOVQ2MZ256kr)
return false;
Register MaskReg = MI.getOperand(0).getReg();
Register SrcVecReg = MI.getOperand(1).getReg();
- unsigned MovMskOpc = (Opc == X86::VPMOVD2MZ128kr)
- ? X86::VMOVMSKPSrr
- : X86::VMOVMSKPSYrr;
+ unsigned MovMskOpc = 0;
+ switch (Opc) {
+ case X86::VPMOVD2MZ128kr:
+ MovMskOpc = X86::VMOVMSKPSrr;
+ break;
+ case X86::VPMOVD2MZ256kr:
+ MovMskOpc = X86::VMOVMSKPSYrr;
+ break;
+ case X86::VPMOVQ2MZ128kr:
+ MovMskOpc = X86::VMOVMSKPDrr;
+ break;
+ case X86::VPMOVQ2MZ256kr:
+ MovMskOpc = X86::VMOVMSKPDYrr;
+ break;
+ default:
+ llvm_unreachable("Unknown VPMOV opcode");
+ }
MachineInstr *KMovMI = nullptr;
@@ -256,7 +270,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
return false;
// Specialized VPMOVD2M + KMOV -> MOVMSK fold first.
- if (tryCompressMultiOpPattern(MI, MBB, ST, ToErase))
+ if (tryCompressVPMOVPattern(MI, MBB, ST, ToErase))
return true;
unsigned Opc = MI.getOpcode();
diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
index a5e08a683e322..6f3be88d7cd0c 100644
--- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -2776,8 +2776,7 @@ declare i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64>)
define i8@test_int_x86_avx512_cvtq2mask_128(<2 x i64> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_128:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovq2m %xmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x08,0x39,0xc0]
-; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vmovmskpd %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x50,0xc0]
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64> %x0)
@@ -2789,8 +2788,7 @@ declare i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64>)
define i8@test_int_x86_avx512_cvtq2mask_256(<4 x i64> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_256:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovq2m %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x39,0xc0]
-; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vmovmskpd %ymm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x50,0xc0]
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 58adbb767ed87..cf49ac1e4886b 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -966,10 +966,9 @@ define <2 x double> @test17(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x doub
; X86-SKX-LABEL: test17:
; X86-SKX: # %bb.0:
; X86-SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; X86-SKX-NEXT: vpmovq2m %xmm1, %k0
; X86-SKX-NEXT: vpslld $3, %xmm0, %xmm0
; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT: kmovw %k0, %eax
+; X86-SKX-NEXT: vmovmskpd %xmm1, %eax
; X86-SKX-NEXT: testb $1, %al
; X86-SKX-NEXT: jne .LBB16_1
; X86-SKX-NEXT: # %bb.2: # %else
@@ -1255,8 +1254,7 @@ define void @test20(<2 x float>%a1, <2 x ptr> %ptr, <2 x i1> %mask) {
; X64-SKX-LABEL: test20:
; X64-SKX: # %bb.0:
; X64-SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; X64-SKX-NEXT: vpmovq2m %xmm2, %k0
-; X64-SKX-NEXT: kmovw %k0, %eax
+; X64-SKX-NEXT: vmovmskpd %xmm2, %eax
; X64-SKX-NEXT: testb $1, %al
; X64-SKX-NEXT: jne .LBB19_1
; X64-SKX-NEXT: # %bb.2: # %else
@@ -1277,8 +1275,7 @@ define void @test20(<2 x float>%a1, <2 x ptr> %ptr, <2 x i1> %mask) {
; X86-SKX-LABEL: test20:
; X86-SKX: # %bb.0:
; X86-SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; X86-SKX-NEXT: vpmovq2m %xmm2, %k0
-; X86-SKX-NEXT: kmovw %k0, %eax
+; X86-SKX-NEXT: vmovmskpd %xmm2, %eax
; X86-SKX-NEXT: testb $1, %al
; X86-SKX-NEXT: jne .LBB19_1
; X86-SKX-NEXT: # %bb.2: # %else
@@ -1352,8 +1349,7 @@ define void @test21(<2 x i32>%a1, <2 x ptr> %ptr, <2 x i1>%mask) {
; X64-SKX-LABEL: test21:
; X64-SKX: # %bb.0:
; X64-SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; X64-SKX-NEXT: vpmovq2m %xmm2, %k0
-; X64-SKX-NEXT: kmovw %k0, %eax
+; X64-SKX-NEXT: vmovmskpd %xmm2, %eax
; X64-SKX-NEXT: testb $1, %al
; X64-SKX-NEXT: jne .LBB20_1
; X64-SKX-NEXT: # %bb.2: # %else
@@ -1374,8 +1370,7 @@ define void @test21(<2 x i32>%a1, <2 x ptr> %ptr, <2 x i1>%mask) {
; X86-SKX-LABEL: test21:
; X86-SKX: # %bb.0:
; X86-SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; X86-SKX-NEXT: vpmovq2m %xmm2, %k0
-; X86-SKX-NEXT: kmovw %k0, %eax
+; X86-SKX-NEXT: vmovmskpd %xmm2, %eax
; X86-SKX-NEXT: testb $1, %al
; X86-SKX-NEXT: jne .LBB20_1
; X86-SKX-NEXT: # %bb.2: # %else
@@ -1494,10 +1489,9 @@ define <2 x float> @test22(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float
; X86-SKX-LABEL: test22:
; X86-SKX: # %bb.0:
; X86-SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; X86-SKX-NEXT: vpmovq2m %xmm1, %k0
; X86-SKX-NEXT: vpslld $2, %xmm0, %xmm0
; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT: kmovw %k0, %eax
+; X86-SKX-NEXT: vmovmskpd %xmm1, %eax
; X86-SKX-NEXT: testb $1, %al
; X86-SKX-NEXT: jne .LBB21_1
; X86-SKX-NEXT: # %bb.2: # %else
@@ -1617,11 +1611,10 @@ define <2 x float> @test22a(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x floa
; X86-SKX-LABEL: test22a:
; X86-SKX: # %bb.0:
; X86-SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; X86-SKX-NEXT: vpmovq2m %xmm1, %k0
; X86-SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SKX-NEXT: vpslld $2, %xmm0, %xmm0
; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT: kmovw %k0, %eax
+; X86-SKX-NEXT: vmovmskpd %xmm1, %eax
; X86-SKX-NEXT: testb $1, %al
; X86-SKX-NEXT: jne .LBB22_1
; X86-SKX-NEXT: # %bb.2: # %else
@@ -1741,10 +1734,9 @@ define <2 x i32> @test23(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %s
; X86-SKX-LABEL: test23:
; X86-SKX: # %bb.0:
; X86-SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; X86-SKX-NEXT: vpmovq2m %xmm1, %k0
; X86-SKX-NEXT: vpslld $2, %xmm0, %xmm0
; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT: kmovw %k0, %eax
+; X86-SKX-NEXT: vmovmskpd %xmm1, %eax
; X86-SKX-NEXT: testb $1, %al
; X86-SKX-NEXT: jne .LBB23_1
; X86-SKX-NEXT: # %bb.2: # %else
@@ -1860,11 +1852,10 @@ define <2 x i32> @test23b(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %
; X86-SKX-LABEL: test23b:
; X86-SKX: # %bb.0:
; X86-SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; X86-SKX-NEXT: vpmovq2m %xmm1, %k0
; X86-SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SKX-NEXT: vpslld $2, %xmm0, %xmm0
; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT: kmovw %k0, %eax
+; X86-SKX-NEXT: vmovmskpd %xmm1, %eax
; X86-SKX-NEXT: testb $1, %al
; X86-SKX-NEXT: jne .LBB24_1
; X86-SKX-NEXT: # %bb.2: # %else
@@ -2034,10 +2025,9 @@ define <2 x i64> @test25(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %s
; X86-SKX-LABEL: test25:
; X86-SKX: # %bb.0:
; X86-SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; X86-SKX-NEXT: vpmovq2m %xmm1, %k0
; X86-SKX-NEXT: vpslld $3, %xmm0, %xmm0
; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT: kmovw %k0, %eax
+; X86-SKX-NEXT: vmovmskpd %xmm1, %eax
; X86-SKX-NEXT: testb $1, %al
; X86-SKX-NEXT: jne .LBB26_1
; X86-SKX-NEXT: # %bb.2: # %else
@@ -3762,10 +3752,9 @@ define void @test_scatter_2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind
; X86-SKX-LABEL: test_scatter_2i32_index:
; X86-SKX: # %bb.0:
; X86-SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; X86-SKX-NEXT: vpmovq2m %xmm2, %k0
; X86-SKX-NEXT: vpslld $3, %xmm1, %xmm1
; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm1, %xmm1
-; X86-SKX-NEXT: kmovw %k0, %eax
+; X86-SKX-NEXT: vmovmskpd %xmm2, %eax
; X86-SKX-NEXT: testb $1, %al
; X86-SKX-NEXT: jne .LBB52_1
; X86-SKX-NEXT: # %bb.2: # %else
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
index aad1b44344850..5b5280601ea71 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
@@ -164,8 +164,7 @@ define <2 x i32> @test_gather_v2i32_data(<2 x ptr> %ptr, <2 x i1> %mask, <2 x i3
; WIDEN_SKX-LABEL: test_gather_v2i32_data:
; WIDEN_SKX: # %bb.0:
; WIDEN_SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; WIDEN_SKX-NEXT: vpmovq2m %xmm1, %k0
-; WIDEN_SKX-NEXT: kmovw %k0, %eax
+; WIDEN_SKX-NEXT: vmovmskpd %xmm1, %eax
; WIDEN_SKX-NEXT: testb $1, %al
; WIDEN_SKX-NEXT: jne .LBB2_1
; WIDEN_SKX-NEXT: # %bb.2: # %else
@@ -226,8 +225,7 @@ define void @test_scatter_v2i32_data(<2 x i32>%a1, <2 x ptr> %ptr, <2 x i1>%mask
; WIDEN_SKX-LABEL: test_scatter_v2i32_data:
; WIDEN_SKX: # %bb.0:
; WIDEN_SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; WIDEN_SKX-NEXT: vpmovq2m %xmm2, %k0
-; WIDEN_SKX-NEXT: kmovw %k0, %eax
+; WIDEN_SKX-NEXT: vmovmskpd %xmm2, %eax
; WIDEN_SKX-NEXT: testb $1, %al
; WIDEN_SKX-NEXT: jne .LBB3_1
; WIDEN_SKX-NEXT: # %bb.2: # %else
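A similar standalone sketch for the 64-bit-element case added in this commit (again illustrative only, not taken from the patch; the function name is made up). Assuming AVX512DQ+VL, the truncate below is expected to select vpsllq $63 + vpmovq2m + kmov, and with the extended fold the single-use vpmovq2m/kmov pair should compress to vmovmskpd %ymm0, %eax:
define i4 @sign_mask_v4i64(<4 x i64> %v) {
  %t = trunc <4 x i64> %v to <4 x i1>   ; lane sign bits -> k-register (vpsllq + vpmovq2m)
  %m = bitcast <4 x i1> %t to i4        ; k-register -> GPR (kmov)
  ret i4 %m
}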
>From b78bc134e9c8a84ebecc617ee2cada87fbe56c83 Mon Sep 17 00:00:00 2001
From: Qihan Cai <caiqihan021 at hotmail.com>
Date: Tue, 23 Dec 2025 23:44:07 +1100
Subject: [PATCH 07/10] restore variables accidentally changed
---
llvm/lib/Target/X86/X86CompressEVEX.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index 379987d162611..37b6e0615d43f 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -273,8 +273,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
if (tryCompressVPMOVPattern(MI, MBB, ST, ToErase))
return true;
- unsigned Opc = MI.getOpcode();
- auto IsRedundantNewDataDest = [&](unsigned &OpcRef) {
+ auto IsRedundantNewDataDest = [&](unsigned &Opc) {
// $rbx = ADD64rr_ND $rbx, $rax / $rbx = ADD64rr_ND $rax, $rbx
// ->
// $rbx = ADD64rr $rbx, $rax
@@ -294,7 +293,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
return false;
// Opcode may change after commute, e.g. SHRD -> SHLD
ST.getInstrInfo()->commuteInstruction(MI, false, 1, 2);
- OpcRef = MI.getOpcode();
+ Opc = MI.getOpcode();
return true;
};
@@ -309,6 +308,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
// For AVX512 cases, EVEX prefix is needed in order to carry this information
// thus preventing the transformation to VEX encoding.
bool IsND = X86II::hasNewDataDest(TSFlags);
+ unsigned Opc = MI.getOpcode();
bool IsSetZUCCm = Opc == X86::SETZUCCm;
if (TSFlags & X86II::EVEX_B && !IsND && !IsSetZUCCm)
return false;
>From 8062e233d7686994735d39688e4da38c77e0bf95 Mon Sep 17 00:00:00 2001
From: Qihan Cai <caiqihan021 at hotmail.com>
Date: Tue, 23 Dec 2025 23:46:48 +1100
Subject: [PATCH 08/10] fixup! move evex after VPMOV*2M per suggestion
---
llvm/lib/Target/X86/X86CompressEVEX.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index 37b6e0615d43f..dbe3bce66dbe4 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -16,7 +16,7 @@
// d. NF_ND (EVEX) -> NF (EVEX)
// e. NonNF (EVEX) -> NF (EVEX)
// f. SETZUCCm (EVEX) -> SETCCm (legacy)
-// g. VPMOV*2M + KMOV (EVEX) -> VMOVMSK (VEX)
+// g. VPMOV*2M (EVEX) + KMOV -> VMOVMSK (VEX)
//
// Compression a, b and c can always reduce code size, with some exceptions
// such as promoted 16-bit CRC32 which is as long as the legacy version.
>From 415368ecb5452cc881d260707e9f3615079cedaf Mon Sep 17 00:00:00 2001
From: Qihan Cai <caiqihan021 at hotmail.com>
Date: Wed, 24 Dec 2025 01:13:06 +1100
Subject: [PATCH 09/10] fixup! do not allow KMOVBrk as truncation is unsafe
---
llvm/lib/Target/X86/X86CompressEVEX.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index dbe3bce66dbe4..1464edcce8ad4 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -227,8 +227,8 @@ static bool tryCompressVPMOVPattern(MachineInstr &MI, MachineBasicBlock &MBB,
return false; // Fail: Mask has MULTIPLE uses
unsigned UseOpc = CurMI.getOpcode();
- bool IsKMOV = (UseOpc == X86::KMOVBrk || UseOpc == X86::KMOVWrk ||
- UseOpc == X86::KMOVDrk || UseOpc == X86::KMOVQrk);
+ bool IsKMOV = UseOpc == X86::KMOVWrk || UseOpc == X86::KMOVDrk ||
+ UseOpc == X86::KMOVQrk;
if (IsKMOV && CurMI.getOperand(1).getReg() == MaskReg) {
KMovMI = &CurMI;
>From e57f5a79df593eafbb5440e71a03c112faff1ae6 Mon Sep 17 00:00:00 2001
From: Qihan Cai <caiqihan021 at hotmail.com>
Date: Wed, 24 Dec 2025 02:18:14 +1100
Subject: [PATCH 10/10] Revert "fixup! do not allow KMOVBrk as truncation is
unsafe"
This reverts commit 415368ecb5452cc881d260707e9f3615079cedaf.
---
llvm/lib/Target/X86/X86CompressEVEX.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index 1464edcce8ad4..dbe3bce66dbe4 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -227,8 +227,8 @@ static bool tryCompressVPMOVPattern(MachineInstr &MI, MachineBasicBlock &MBB,
return false; // Fail: Mask has MULTIPLE uses
unsigned UseOpc = CurMI.getOpcode();
- bool IsKMOV = UseOpc == X86::KMOVWrk || UseOpc == X86::KMOVDrk ||
- UseOpc == X86::KMOVQrk;
+ bool IsKMOV = (UseOpc == X86::KMOVBrk || UseOpc == X86::KMOVWrk ||
+ UseOpc == X86::KMOVDrk || UseOpc == X86::KMOVQrk);
if (IsKMOV && CurMI.getOperand(1).getReg() == MaskReg) {
KMovMI = &CurMI;