[llvm] 04ba80c - [Instcombiner]Improve emission of logical or/and reductions.

Thu Mar 4 08:01:56 PST 2021

Author: Alexey Bataev
Date: 2021-03-04T08:01:02-08:00
New Revision: 04ba80ca4dee39ce5176ea241d076e5493ecbebe

URL: https://github.com/llvm/llvm-project/commit/04ba80ca4dee39ce5176ea241d076e5493ecbebe
DIFF: https://github.com/llvm/llvm-project/commit/04ba80ca4dee39ce5176ea241d076e5493ecbebe.diff

LOG: [Instcombiner]Improve emission of logical or/and reductions.

For logical or/and reductions we currently emit regular @llvm.vector.reduce.or/and.vxi1 intrinsic calls.
These intrinsics are not efficient for logical or/and reductions,
especially when the optimizer is able to emit short-circuit versions of
the scalar or/and instructions; in that case the vector code ends up less
efficient than the scalar version.
Instead, an `or` reduction for i1 can be represented as:
```
%val = bitcast <ReduxWidth x i1> to iReduxWidth
%res = cmp ne iReduxWidth %val, 0
```
An `and` reduction for i1 can be represented as:
```
%val = bitcast <ReduxWidth x i1> to iReduxWidth
%res = cmp eq iReduxWidth %val, 11111 ; i.e. all bits set (-1)
```
This improves performance of the vector code significantly and makes it
outperform the short-circuit scalar code.

Part of D57059.

Differential Revision: https://reviews.llvm.org/D97406

Added: 
    

Modified: 
    llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
    llvm/test/Transforms/InstCombine/vector-logical-reductions.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 24f5eba1a15e..3e68cc30f3c1 100644

--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1799,6 +1799,34 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
+  case Intrinsic::vector_reduce_or:
+  case Intrinsic::vector_reduce_and: {
+    // Canonicalize logical or/and reductions:
+    // Or reduction for i1 is represented as:
+    // %val = bitcast <ReduxWidth x i1> to iReduxWidth
+    // %res = cmp ne iReduxWidth %val, 0
+    // And reduction for i1 is represented as:
+    // %val = bitcast <ReduxWidth x i1> to iReduxWidth
+    // %res = cmp eq iReduxWidth %val, 11111
+    Value *Arg = II->getArgOperand(0);
+    Type *RetTy = II->getType();
+    if (RetTy == Builder.getInt1Ty())
+      if (auto *FVTy = dyn_cast<FixedVectorType>(Arg->getType())) {
+        Value *Res = Builder.CreateBitCast(
+            Arg, Builder.getIntNTy(FVTy->getNumElements()));
+        if (IID == Intrinsic::vector_reduce_and) {
+          Res = Builder.CreateICmpEQ(
+              Res, ConstantInt::getAllOnesValue(Res->getType()));
+        } else {
+          assert(IID == Intrinsic::vector_reduce_or &&
+                 "Expected or reduction.");
+          Res = Builder.CreateIsNotNull(Res);
+        }
+        replaceInstUsesWith(CI, Res);
+        return eraseInstFromFunction(CI);
+      }
+    break;
+  }
   default: {
     // Handle target specific intrinsics
     Optional<Instruction *> V = targetInstCombineIntrinsic(*II);

diff  --git a/llvm/test/Transforms/InstCombine/vector-logical-reductions.ll b/llvm/test/Transforms/InstCombine/vector-logical-reductions.ll
index 79933bcf3145..f8127765a5ba 100644
--- a/llvm/test/Transforms/InstCombine/vector-logical-reductions.ll
+++ b/llvm/test/Transforms/InstCombine/vector-logical-reductions.ll
@@ -3,8 +3,9 @@
 
 define i1 @reduction_logical_or(<4 x i1> %x) {
 ; CHECK-LABEL: @reduction_logical_or(
-; CHECK-NEXT:    [[R:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[X:%.*]])
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i1> [[X:%.*]] to i4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i4 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[TMP2]]
 ;
   %r = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x)
   ret i1 %r
@@ -12,8 +13,9 @@ define i1 @reduction_logical_or(<4 x i1> %x) {
 
 define i1 @reduction_logical_and(<4 x i1> %x) {
 ; CHECK-LABEL: @reduction_logical_and(
-; CHECK-NEXT:    [[R:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[X:%.*]])
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i1> [[X:%.*]] to i4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i4 [[TMP1]], -1
+; CHECK-NEXT:    ret i1 [[TMP2]]
 ;
   %r = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
   ret i1 %r