[llvm] 3568333 - [AArch64] Perform last active true vector combine

Mon Mar 14 10:28:33 PDT 2022

Author: zhongyunde
Date: 2022-03-15T01:25:03+08:00
New Revision: 3568333815b30dc565ce041c64e871bc1d4e8e21

URL: https://github.com/llvm/llvm-project/commit/3568333815b30dc565ce041c64e871bc1d4e8e21
DIFF: https://github.com/llvm/llvm-project/commit/3568333815b30dc565ce041c64e871bc1d4e8e21.diff

LOG: [AArch64] Perform last active true vector combine

Test bit of lane EC-1 can use P register directly, eg:
Materialize : Idx = (add (mul vscale, NumEls), -1)
               i1 = extract_vector_elt t37, Constant:i64<Idx>
    ... into: "ptrue p, all" + PTEST

Reviewed By: paulwalker-arm

Differential Revision: https://reviews.llvm.org/D121180

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/sve-cmp-folds.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0f2d7997c870b..b538e6fff2861 100644

--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14410,12 +14410,57 @@ performFirstTrueTestVectorCombine(SDNode *N,
   return getPTest(DAG, N->getValueType(0), Pg, SetCC, AArch64CC::FIRST_ACTIVE);
 }
 
+// Materialize : Idx = (add (mul vscale, NumEls), -1)
+//               i1 = extract_vector_elt t37, Constant:i64<Idx>
+//     ... into: "ptrue p, all" + PTEST
+static SDValue
+performLastTrueTestVectorCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const AArch64Subtarget *Subtarget) {
+  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
+  // Make sure PTEST is legal types.
+  if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
+    return SDValue();
+
+  SDValue SetCC = N->getOperand(0);
+  EVT OpVT = SetCC.getValueType();
+
+  if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  // Idx == (add (mul vscale, NumEls), -1)
+  SDValue Idx = N->getOperand(1);
+  if (Idx.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  SDValue VS = Idx.getOperand(0);
+  if (VS.getOpcode() != ISD::VSCALE)
+    return SDValue();
+
+  unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
+  if (VS.getConstantOperandVal(0) != NumEls)
+    return SDValue();
+
+  // Restricted the DAG combine to only cases where we're extracting from a
+  // flag-setting operation
+  auto *CI = dyn_cast<ConstantSDNode>(Idx.getOperand(1));
+  if (!CI || !CI->isAllOnes() || SetCC.getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
+  return getPTest(DAG, N->getValueType(0), Pg, SetCC, AArch64CC::LAST_ACTIVE);
+}
+
 static SDValue
 performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                const AArch64Subtarget *Subtarget) {
   assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
   if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
     return Res;
+  if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
+    return Res;
 
   SelectionDAG &DAG = DCI.DAG;
   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);

diff  --git a/llvm/test/CodeGen/AArch64/sve-cmp-folds.ll b/llvm/test/CodeGen/AArch64/sve-cmp-folds.ll
index a4951c41a3256..90f88b8bbe02b 100644
--- a/llvm/test/CodeGen/AArch64/sve-cmp-folds.ll
+++ b/llvm/test/CodeGen/AArch64/sve-cmp-folds.ll
@@ -53,8 +53,8 @@ define <vscale x 4 x i1> @not_fcmp_uge_nxv4f32(<vscale x 4 x float> %a, <vscale
   ret <vscale x 4 x i1> %not
 }
 
-define i1 @foo(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
-; CHECK-LABEL: foo:
+define i1 @foo_first(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: foo_first:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
@@ -66,3 +66,21 @@ define i1 @foo(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
   ret i1 %bit
 }
 
+define i1 @foo_last(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: foo_last:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ptest p0, p1.b
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %vcond = fcmp oeq <vscale x 4 x float> %a, %b
+  %vscale = call i64 @llvm.vscale.i64()
+  %shl2 = shl nuw nsw i64 %vscale, 2
+  %idx = add nuw nsw i64 %shl2, -1
+  %bit = extractelement <vscale x 4 x i1> %vcond, i64 %idx
+  ret i1 %bit
+}
+
+
+declare i64 @llvm.vscale.i64()