[llvm] 6b0ed50 - [ARM][MVE] Tail-Predication: recognise (again) active lanes IR pattern

Tue Feb 11 07:18:50 PST 2020

Author: Sjoerd Meijer
Date: 2020-02-11T15:18:18Z
New Revision: 6b0ed508fa3947ec1f3a1bd87a08b598e98423e3

URL: https://github.com/llvm/llvm-project/commit/6b0ed508fa3947ec1f3a1bd87a08b598e98423e3
DIFF: https://github.com/llvm/llvm-project/commit/6b0ed508fa3947ec1f3a1bd87a08b598e98423e3.diff

LOG: [ARM][MVE] Tail-Predication: recognise (again) active lanes IR pattern

A small IR change in how the active lanes are calculated meant that
tail-predication was no longer recognised. Now recognise both an 'add' and
an 'or' in the expression that calculates the active lanes.

Differential Revision: https://reviews.llvm.org/D74394

Added: 
    

Modified: 
    llvm/lib/Target/ARM/MVETailPredication.cpp
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 151385de7850..9b8c437c053b 100644

--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -260,13 +260,18 @@ bool MVETailPredication::isTailPredicate(TripCountPattern &TCP) {
   // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
   //                                  <4 x i32> undef,
   //                                  <4 x i32> zeroinitializer
-  // %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  // %induction = [add|or] <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11
-
+  //
+  // Please note that the 'or' is equivalent to the 'add' here. This relies on
+  // BroadcastSplat being the IV, which we know is a phi with 0 start and Lanes
+  // increment, all of which is checked below.
   Instruction *BroadcastSplat = nullptr;
   Constant *Const = nullptr;
   if (!match(TCP.Induction,
-             m_Add(m_Instruction(BroadcastSplat), m_Constant(Const))))
+             m_Add(m_Instruction(BroadcastSplat), m_Constant(Const))) &&
+      !match(TCP.Induction,
+             m_Or(m_Instruction(BroadcastSplat), m_Constant(Const))))
     return false;
 
   // Check that we're adding <0, 1, 2, 3...

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
index 257d950c60fb..ad7920007267 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -32,7 +32,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
-  %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %tmp = getelementptr inbounds i8, i8* %a, i32 %index
   %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i8* %tmp to <16 x i8>*
@@ -137,7 +137,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*