[llvm] VectorCombine: lift one-use limitation in foldExtractedCmps (PR #110902)

Thu Oct 10 03:23:08 PDT 2024

https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/110902

>From 1e8b3754f220a2cf4c8af44cd57b7a688cb07240 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Wed, 2 Oct 2024 17:54:49 +0100
Subject: [PATCH 1/3] VectorCombine: lift one-use limitation in foldExtractedCmps

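foldExtractedCmps artificially requires both the compares and the
extracts it matches to have a single use. Strip the m_OneUse matchers,
and adjust the cost model to account for multi-use: an extract that has
other users is not removed by the fold, so its cost is added to the new
cost. This lifts the limitation.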
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 24 ++++----
 .../VectorCombine/X86/extract-cmp-binop.ll    | 57 +++++++++++++++++++
 2 files changed, 69 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index a2ab5d96664078..be1abd16843fd6 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1038,23 +1038,20 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
 
   // The compare predicates should match, and each compare should have a
   // constant operand.
-  // TODO: Relax the one-use constraints.
   Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
   Instruction *I0, *I1;
   Constant *C0, *C1;
   CmpInst::Predicate P0, P1;
-  if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) ||
-      !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) ||
-      P0 != P1)
+  if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
+      !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))) || P0 != P1)
     return false;
 
   // The compare operands must be extracts of the same vector with constant
   // extract indexes.
-  // TODO: Relax the one-use constraints.
   Value *X;
   uint64_t Index0, Index1;
-  if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) ||
-      !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))))
+  if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
+      !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
     return false;
 
   auto *Ext0 = cast<ExtractElementInst>(I0);
@@ -1073,14 +1070,16 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
     return false;
 
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost Ext0Cost =
+                      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0),
+                  Ext1Cost =
+                      TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
   InstructionCost OldCost =
-      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
-  OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
-  OldCost +=
+      Ext0Cost + Ext1Cost +
       TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
                              CmpInst::makeCmpResultType(I0->getType()), Pred) *
-      2;
-  OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
+          2 +
+      TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
 
   // The proposed vector pattern is:
   // vcmp = cmp Pred X, VecC
@@ -1096,6 +1095,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
                                 ShufMask);
   NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
   NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
+  NewCost += !Ext0->hasOneUse() * Ext0Cost + !Ext1->hasOneUse() * Ext1Cost;
 
   // Aggressively form vector ops if the cost is equal because the transform
   // may enable further optimization.
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
index 462bb13ae7d12a..da0b5b8358e01b 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
@@ -92,6 +92,63 @@ define i1 @icmp_add_v8i32(<8 x i32> %a) {
   ret i1 %r
 }
 
+declare void @use.i1(i1)
+declare void @use.i32(i32)
+declare void @use.double(double)
+
+
+define i1 @fcmp_and_v2f64_multiuse(<2 x double> %a) {
+; SSE-LABEL: @fcmp_and_v2f64_multiuse(
+; SSE-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
+; SSE-NEXT:    call void @use.double(double [[E1]])
+; SSE-NEXT:    [[E2:%.*]] = extractelement <2 x double> [[A]], i32 1
+; SSE-NEXT:    [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01
+; SSE-NEXT:    [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00
+; SSE-NEXT:    [[R:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; SSE-NEXT:    call void @use.i1(i1 [[R]])
+; SSE-NEXT:    ret i1 [[R]]
+;
+; AVX-LABEL: @fcmp_and_v2f64_multiuse(
+; AVX-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
+; AVX-NEXT:    call void @use.double(double [[E1]])
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp olt <2 x double> [[A]], <double 4.200000e+01, double -8.000000e+00>
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <2 x i32> <i32 1, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[SHIFT]]
+; AVX-NEXT:    [[R:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
+; AVX-NEXT:    call void @use.i1(i1 [[R]])
+; AVX-NEXT:    ret i1 [[R]]
+;
+  %e1 = extractelement <2 x double> %a, i32 0
+  call void @use.double(double %e1)
+  %e2 = extractelement <2 x double> %a, i32 1
+  %cmp1 = fcmp olt double %e1, 42.0
+  %cmp2 = fcmp olt double %e2, -8.0
+  %r = and i1 %cmp1, %cmp2
+  call void @use.i1(i1 %r)
+  ret i1 %r
+}
+
+define i1 @icmp_xor_v4i32_multiuse(<4 x i32> %a) {
+; CHECK-LABEL: @icmp_xor_v4i32_multiuse(
+; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 1
+; CHECK-NEXT:    call void @use.i32(i32 [[E2]])
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[A]], <i32 poison, i32 -8, i32 poison, i32 42>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> <i32 poison, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], [[SHIFT]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
+; CHECK-NEXT:    call void @use.i1(i1 [[R]])
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %e1 = extractelement <4 x i32> %a, i32 3
+  %e2 = extractelement <4 x i32> %a, i32 1
+  call void @use.i32(i32 %e2)
+  %cmp1 = icmp sgt i32 %e1, 42
+  %cmp2 = icmp sgt i32 %e2, -8
+  %r = xor i1 %cmp1, %cmp2
+  call void @use.i1(i1 %r)
+  ret i1 %r
+}
+
 ; Negative test - this could CSE/simplify.
 
 define i1 @same_extract_index(<4 x i32> %a) {

>From e143a8ace3c32fe2c41001019ff74c94e365dcaa Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Fri, 4 Oct 2024 17:41:37 +0100
Subject: [PATCH 2/3] extract-cmp-binop: simplify added tests

---
 .../VectorCombine/X86/extract-cmp-binop.ll    | 25 ++++++++-----------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
index da0b5b8358e01b..be5359f549ac94 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
@@ -92,60 +92,57 @@ define i1 @icmp_add_v8i32(<8 x i32> %a) {
   ret i1 %r
 }
 
-declare void @use.i1(i1)
-declare void @use.i32(i32)
-declare void @use.double(double)
-
+declare void @use()
 
 define i1 @fcmp_and_v2f64_multiuse(<2 x double> %a) {
 ; SSE-LABEL: @fcmp_and_v2f64_multiuse(
 ; SSE-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
-; SSE-NEXT:    call void @use.double(double [[E1]])
+; SSE-NEXT:    call void @use(double [[E1]])
 ; SSE-NEXT:    [[E2:%.*]] = extractelement <2 x double> [[A]], i32 1
 ; SSE-NEXT:    [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01
 ; SSE-NEXT:    [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00
 ; SSE-NEXT:    [[R:%.*]] = and i1 [[CMP1]], [[CMP2]]
-; SSE-NEXT:    call void @use.i1(i1 [[R]])
+; SSE-NEXT:    call void @use(i1 [[R]])
 ; SSE-NEXT:    ret i1 [[R]]
 ;
 ; AVX-LABEL: @fcmp_and_v2f64_multiuse(
 ; AVX-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
-; AVX-NEXT:    call void @use.double(double [[E1]])
+; AVX-NEXT:    call void @use(double [[E1]])
 ; AVX-NEXT:    [[TMP1:%.*]] = fcmp olt <2 x double> [[A]], <double 4.200000e+01, double -8.000000e+00>
 ; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <2 x i32> <i32 1, i32 poison>
 ; AVX-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[SHIFT]]
 ; AVX-NEXT:    [[R:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
-; AVX-NEXT:    call void @use.i1(i1 [[R]])
+; AVX-NEXT:    call void @use(i1 [[R]])
 ; AVX-NEXT:    ret i1 [[R]]
 ;
   %e1 = extractelement <2 x double> %a, i32 0
-  call void @use.double(double %e1)
+  call void @use(double %e1)
   %e2 = extractelement <2 x double> %a, i32 1
   %cmp1 = fcmp olt double %e1, 42.0
   %cmp2 = fcmp olt double %e2, -8.0
   %r = and i1 %cmp1, %cmp2
-  call void @use.i1(i1 %r)
+  call void @use(i1 %r)
   ret i1 %r
 }
 
 define i1 @icmp_xor_v4i32_multiuse(<4 x i32> %a) {
 ; CHECK-LABEL: @icmp_xor_v4i32_multiuse(
 ; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 1
-; CHECK-NEXT:    call void @use.i32(i32 [[E2]])
+; CHECK-NEXT:    call void @use(i32 [[E2]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[A]], <i32 poison, i32 -8, i32 poison, i32 42>
 ; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> <i32 poison, i32 3, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], [[SHIFT]]
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
-; CHECK-NEXT:    call void @use.i1(i1 [[R]])
+; CHECK-NEXT:    call void @use(i1 [[R]])
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %e1 = extractelement <4 x i32> %a, i32 3
   %e2 = extractelement <4 x i32> %a, i32 1
-  call void @use.i32(i32 %e2)
+  call void @use(i32 %e2)
   %cmp1 = icmp sgt i32 %e1, 42
   %cmp2 = icmp sgt i32 %e2, -8
   %r = xor i1 %cmp1, %cmp2
-  call void @use.i1(i1 %r)
+  call void @use(i1 %r)
   ret i1 %r
 }
 

>From 6f64a9a5820e8644c546801bc10e1b6f54d3d6df Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 10 Oct 2024 11:22:24 +0100
Subject: [PATCH 3/3] VectorCombine: fix nit with bool arithmetic

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index be1abd16843fd6..a65ec1b734fe02 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1095,7 +1095,8 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
                                 ShufMask);
   NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
   NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
-  NewCost += !Ext0->hasOneUse() * Ext0Cost + !Ext1->hasOneUse() * Ext1Cost;
+  NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
+  NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
 
   // Aggressively form vector ops if the cost is equal because the transform
   // may enable further optimization.