[llvm] b0bea80 - [X86] Prefer vmovmsk instead of vtest for alderlake.

via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 8 02:39:03 PDT 2023


Author: Luo, Yuanke
Date: 2023-06-08T17:38:47+08:00
New Revision: b0bea80ab479e9bb016fcdb62d7d0eceec2b28e3

URL: https://github.com/llvm/llvm-project/commit/b0bea80ab479e9bb016fcdb62d7d0eceec2b28e3
DIFF: https://github.com/llvm/llvm-project/commit/b0bea80ab479e9bb016fcdb62d7d0eceec2b28e3.diff

LOG: [X86] Prefer vmovmsk instead of vtest for alderlake.

On Alder Lake E-cores, the latency of VMOVMSKPS is 5 for both YMM and
XMM. The latency of VTESTPS is 7 for YMM and 5 for XMM. Since Alder Lake
uses the P-core schedule model, we can't determine which one is better
based on the latency information in the schedule model. Instead, we add
a tuning feature for Alder Lake and select VMOVMSKPS when that tuning
feature is set. In the case of "vmovmskps + test + jcc", the test and
jcc can be fused, while vtest and jcc can't.

Differential Revision: https://reviews.llvm.org/D152227

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86.td
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/combine-movmsk-avx.ll
    llvm/test/CodeGen/X86/combine-movmsk.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index d664b24e33cbf..e9f9f1bb142b1 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -422,6 +422,9 @@ def FeatureHardenSlsIJmp
 //===----------------------------------------------------------------------===//
 // X86 Subtarget Tuning features
 //===----------------------------------------------------------------------===//
+def TuningPreferMovmskOverVTest : SubtargetFeature<"prefer-movmsk-over-vtest",
+                                       "PreferMovmskOverVTest", "true",
+                                       "Prefer movmsk over vtest instruction">;
 
 def TuningSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
                                        "SHLD instruction is slow">;
@@ -1166,7 +1169,8 @@ def ProcessorFeatures {
                                                   FeatureMOVDIRI,
                                                   FeatureMOVDIR64B,
                                                   FeatureWAITPKG];
-  list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps];
+  list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps,
+                                                TuningPreferMovmskOverVTest];
   list<SubtargetFeature> ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning);
   list<SubtargetFeature> ADLFeatures =
     !listconcat(TRMFeatures, ADLAdditionalFeatures);

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4fc96d820bb1b..170396dc7ba9e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48024,7 +48024,8 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
   // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
   // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
   // iff every element is referenced.
-  if (NumElts <= CmpBits && Subtarget.hasAVX() && IsOneUse &&
+  if (NumElts <= CmpBits && Subtarget.hasAVX() &&
+      !Subtarget.preferMovmskOverVTest() && IsOneUse &&
       (NumEltBits == 32 || NumEltBits == 64)) {
     SDLoc DL(EFLAGS);
     MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);

diff  --git a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
index 871703de6e9f8..b3f4878745193 100644
--- a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
+++ b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s --check-prefixes=ADL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+prefer-movmsk-over-vtest | FileCheck %s --check-prefixes=ADL
 
 declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>)
 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
@@ -22,7 +22,8 @@ define i1 @movmskps_noneof_bitcast_v4f64(<4 x double> %a0) {
 ; ADL:       # %bb.0:
 ; ADL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; ADL-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
-; ADL-NEXT:    vtestpd %ymm0, %ymm0
+; ADL-NEXT:    vmovmskpd %ymm0, %eax
+; ADL-NEXT:    testl %eax, %eax
 ; ADL-NEXT:    sete %al
 ; ADL-NEXT:    vzeroupper
 ; ADL-NEXT:    retq
@@ -59,9 +60,9 @@ define i1 @movmskps_allof_bitcast_v4f64(<4 x double> %a0) {
 ; ADL:       # %bb.0:
 ; ADL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; ADL-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
-; ADL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; ADL-NEXT:    vtestpd %ymm1, %ymm0
-; ADL-NEXT:    setb %al
+; ADL-NEXT:    vmovmskpd %ymm0, %eax
+; ADL-NEXT:    cmpl $15, %eax
+; ADL-NEXT:    sete %al
 ; ADL-NEXT:    vzeroupper
 ; ADL-NEXT:    retq
   %1 = fcmp oeq <4 x double> %a0, zeroinitializer
@@ -203,10 +204,10 @@ define i32 @movmskps_concat_v4f32(<4 x float> %a0, <4 x float> %a1)  {
 ; ADL-LABEL: movmskps_concat_v4f32:
 ; ADL:       # %bb.0:
 ; ADL-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; ADL-NEXT:    vmovmskps %xmm0, %ecx
 ; ADL-NEXT:    xorl %eax, %eax
-; ADL-NEXT:    vtestps %xmm0, %xmm0
-; ADL-NEXT:    setne %al
-; ADL-NEXT:    negl %eax
+; ADL-NEXT:    negl %ecx
+; ADL-NEXT:    sbbl %eax, %eax
 ; ADL-NEXT:    retq
   %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %1)

diff  --git a/llvm/test/CodeGen/X86/combine-movmsk.ll b/llvm/test/CodeGen/X86/combine-movmsk.ll
index b365a5fd13825..baa0553a34f48 100644
--- a/llvm/test/CodeGen/X86/combine-movmsk.ll
+++ b/llvm/test/CodeGen/X86/combine-movmsk.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s --check-prefixes=ADL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+prefer-movmsk-over-vtest | FileCheck %s --check-prefixes=ADL
 
 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)
 declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>)
@@ -33,7 +33,8 @@ define i1 @movmskps_noneof_bitcast_v2f64(<2 x double> %a0) {
 ; ADL:       # %bb.0:
 ; ADL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; ADL-NEXT:    vcmpeqpd %xmm0, %xmm1, %xmm0
-; ADL-NEXT:    vtestpd %xmm0, %xmm0
+; ADL-NEXT:    vmovmskpd %xmm0, %eax
+; ADL-NEXT:    testl %eax, %eax
 ; ADL-NEXT:    sete %al
 ; ADL-NEXT:    retq
   %1 = fcmp oeq <2 x double> zeroinitializer, %a0
@@ -67,9 +68,9 @@ define i1 @movmskps_allof_bitcast_v2f64(<2 x double> %a0) {
 ; ADL:       # %bb.0:
 ; ADL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; ADL-NEXT:    vcmpeqpd %xmm0, %xmm1, %xmm0
-; ADL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; ADL-NEXT:    vtestpd %xmm1, %xmm0
-; ADL-NEXT:    setb %al
+; ADL-NEXT:    vmovmskpd %xmm0, %eax
+; ADL-NEXT:    cmpl $3, %eax
+; ADL-NEXT:    sete %al
 ; ADL-NEXT:    retq
   %1 = fcmp oeq <2 x double> zeroinitializer, %a0
   %2 = sext <2 x i1> %1 to <2 x i64>
@@ -103,7 +104,8 @@ define i1 @pmovmskb_noneof_bitcast_v2i64(<2 x i64> %a0) {
 ;
 ; ADL-LABEL: pmovmskb_noneof_bitcast_v2i64:
 ; ADL:       # %bb.0:
-; ADL-NEXT:    vtestpd %xmm0, %xmm0
+; ADL-NEXT:    vmovmskpd %xmm0, %eax
+; ADL-NEXT:    testl %eax, %eax
 ; ADL-NEXT:    sete %al
 ; ADL-NEXT:    retq
   %1 = icmp sgt <2 x i64> zeroinitializer, %a0
@@ -139,9 +141,9 @@ define i1 @pmovmskb_allof_bitcast_v2i64(<2 x i64> %a0) {
 ;
 ; ADL-LABEL: pmovmskb_allof_bitcast_v2i64:
 ; ADL:       # %bb.0:
-; ADL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; ADL-NEXT:    vtestpd %xmm1, %xmm0
-; ADL-NEXT:    setb %al
+; ADL-NEXT:    vmovmskpd %xmm0, %eax
+; ADL-NEXT:    cmpl $3, %eax
+; ADL-NEXT:    sete %al
 ; ADL-NEXT:    retq
   %1 = icmp sgt <2 x i64> zeroinitializer, %a0
   %2 = sext <2 x i1> %1 to <2 x i64>
@@ -173,7 +175,8 @@ define i1 @pmovmskb_noneof_bitcast_v4f32(<4 x float> %a0) {
 ; ADL:       # %bb.0:
 ; ADL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; ADL-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
-; ADL-NEXT:    vtestps %xmm0, %xmm0
+; ADL-NEXT:    vmovmskps %xmm0, %eax
+; ADL-NEXT:    testl %eax, %eax
 ; ADL-NEXT:    sete %al
 ; ADL-NEXT:    retq
   %1 = fcmp oeq <4 x float> %a0, zeroinitializer
@@ -207,9 +210,9 @@ define i1 @pmovmskb_allof_bitcast_v4f32(<4 x float> %a0) {
 ; ADL:       # %bb.0:
 ; ADL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; ADL-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
-; ADL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; ADL-NEXT:    vtestps %xmm1, %xmm0
-; ADL-NEXT:    setb %al
+; ADL-NEXT:    vmovmskps %xmm0, %eax
+; ADL-NEXT:    cmpl $15, %eax
+; ADL-NEXT:    sete %al
 ; ADL-NEXT:    retq
   %1 = fcmp oeq <4 x float> %a0, zeroinitializer
   %2 = sext <4 x i1> %1 to <4 x i32>
@@ -513,10 +516,11 @@ define i32 @movmskps_ptest_numelts_mismatch(<16 x i8> %a0) {
 ; ADL:       # %bb.0:
 ; ADL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; ADL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
-; ADL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; ADL-NEXT:    vmovmskps %xmm0, %ecx
 ; ADL-NEXT:    xorl %eax, %eax
-; ADL-NEXT:    vtestps %xmm1, %xmm0
-; ADL-NEXT:    sbbl %eax, %eax
+; ADL-NEXT:    cmpl $15, %ecx
+; ADL-NEXT:    sete %al
+; ADL-NEXT:    negl %eax
 ; ADL-NEXT:    retq
   %1 = icmp eq <16 x i8> %a0, zeroinitializer
   %2 = sext <16 x i1> %1 to <16 x i8>


        


More information about the llvm-commits mailing list