[llvm] becaa68 - [ARM] Constant fold VCTP intrinsics
David Green via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 21 04:03:55 PDT 2020
Author: David Green
Date: 2020-07-21T11:39:31+01:00
New Revision: becaa6803ab532d15506829f0551a5fa49c39d7e
URL: https://github.com/llvm/llvm-project/commit/becaa6803ab532d15506829f0551a5fa49c39d7e
DIFF: https://github.com/llvm/llvm-project/commit/becaa6803ab532d15506829f0551a5fa49c39d7e.diff
LOG: [ARM] Constant fold VCTP intrinsics
We can sometimes end up in a situation where the operand to a vctp
intrinsic becomes constant, such as after a loop is fully unrolled. This
patch adds the constant folding needed for these intrinsics, allowing
them to be simplified away and hopefully simplifying the remaining
instructions.
Differential Revision: https://reviews.llvm.org/D84110
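
As a rough illustration of what the fold computes, here is a small
standalone C++ sketch (foldVCTP is a made-up helper, not LLVM API; it
only mirrors the logic of the new case added to ConstantFoldVectorCall
below): lane i of the resulting predicate vector is true iff i < limit,
where limit is the intrinsic's i32 operand zero-extended, so an operand
of -1 enables every lane. Since vctp64 is currently modelled as
returning a v4i1 rather than a v2i1, the limit is doubled so that both
i1 lanes of each 64-bit element are set.

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical standalone mirror of the new constant-folding case:
// returns the boolean lane mask that a vctp intrinsic with a constant
// operand folds to. `lanes` is 16 for vctp8, 8 for vctp16 and 4 for
// vctp32/vctp64.
std::vector<bool> foldVCTP(unsigned lanes, uint64_t limit, bool isVctp64) {
  // vctp64 is modelled as a v4i1, so each 64-bit element covers two i1
  // lanes; double the limit to set both lanes of every active element.
  if (isVctp64)
    limit *= 2;
  std::vector<bool> mask(lanes);
  for (unsigned i = 0; i < lanes; ++i)
    mask[i] = i < limit; // lane is active iff its index is below the count
  return mask;
}

int main() {
  // vctp32 with a count of 3 folds to <4 x i1> <true, true, true, false>.
  for (bool b : foldVCTP(4, 3, false))
    std::cout << b << ' ';
  std::cout << '\n'; // prints: 1 1 1 0
  // vctp64 with a count of 1 folds to <4 x i1> <true, true, false, false>.
  for (bool b : foldVCTP(4, 1, true))
    std::cout << b << ' ';
  std::cout << '\n'; // prints: 1 1 0 0
}

The mve-vctp.ll tests added below exercise exactly these boundary
values, including the zero, saturating (100) and -1 cases.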
Added:
llvm/test/Analysis/ConstantFolding/ARM/lit.local.cfg
llvm/test/Analysis/ConstantFolding/ARM/mve-vctp.ll
Modified:
llvm/lib/Analysis/ConstantFolding.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 6feffcbb98e1..7c3e35be5074 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -41,6 +41,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
@@ -1456,6 +1457,11 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
case Intrinsic::experimental_vector_reduce_smax:
case Intrinsic::experimental_vector_reduce_umin:
case Intrinsic::experimental_vector_reduce_umax:
+ // Target intrinsics
+ case Intrinsic::arm_mve_vctp8:
+ case Intrinsic::arm_mve_vctp16:
+ case Intrinsic::arm_mve_vctp32:
+ case Intrinsic::arm_mve_vctp64:
return true;
// Floating point operations cannot be folded in strictfp functions in
@@ -2719,7 +2725,8 @@ static Constant *ConstantFoldVectorCall(StringRef Name,
SmallVector<Constant *, 4> Lane(Operands.size());
Type *Ty = FVTy->getElementType();
- if (IntrinsicID == Intrinsic::masked_load) {
+ switch (IntrinsicID) {
+ case Intrinsic::masked_load: {
auto *SrcPtr = Operands[0];
auto *Mask = Operands[2];
auto *Passthru = Operands[3];
@@ -2757,6 +2764,32 @@ static Constant *ConstantFoldVectorCall(StringRef Name,
return nullptr;
return ConstantVector::get(NewElements);
}
+ case Intrinsic::arm_mve_vctp8:
+ case Intrinsic::arm_mve_vctp16:
+ case Intrinsic::arm_mve_vctp32:
+ case Intrinsic::arm_mve_vctp64: {
+ if (auto *Op = dyn_cast<ConstantInt>(Operands[0])) {
+ unsigned Lanes = FVTy->getNumElements();
+ uint64_t Limit = Op->getZExtValue();
+ // vctp64 is currently modelled as returning a v4i1, not a v2i1. Make
+ // sure we get the limit right in that case and set all relevant lanes.
+ if (IntrinsicID == Intrinsic::arm_mve_vctp64)
+ Limit *= 2;
+
+ SmallVector<Constant *, 16> NCs;
+ for (unsigned i = 0; i < Lanes; i++) {
+ if (i < Limit)
+ NCs.push_back(ConstantInt::getTrue(Ty));
+ else
+ NCs.push_back(ConstantInt::getFalse(Ty));
+ }
+ return ConstantVector::get(NCs);
+ }
+ break;
+ }
+ default:
+ break;
+ }
for (unsigned I = 0, E = FVTy->getNumElements(); I != E; ++I) {
// Gather a column of constants.
diff --git a/llvm/test/Analysis/ConstantFolding/ARM/lit.local.cfg b/llvm/test/Analysis/ConstantFolding/ARM/lit.local.cfg
new file mode 100644
index 000000000000..236e1d344166
--- /dev/null
+++ b/llvm/test/Analysis/ConstantFolding/ARM/lit.local.cfg
@@ -0,0 +1,2 @@
+if 'ARM' not in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Analysis/ConstantFolding/ARM/mve-vctp.ll b/llvm/test/Analysis/ConstantFolding/ARM/mve-vctp.ll
new file mode 100644
index 000000000000..0651a489e35a
--- /dev/null
+++ b/llvm/test/Analysis/ConstantFolding/ARM/mve-vctp.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -instsimplify -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+define <16 x i1> @vctp8_0() {
+; CHECK-LABEL: @vctp8_0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <16 x i1> zeroinitializer
+;
+entry:
+ %int = call <16 x i1> @llvm.arm.mve.vctp8(i32 0)
+ ret <16 x i1> %int
+}
+
+define <16 x i1> @vctp8_1() {
+; CHECK-LABEL: @vctp8_1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>
+;
+entry:
+ %int = call <16 x i1> @llvm.arm.mve.vctp8(i32 1)
+ ret <16 x i1> %int
+}
+
+define <16 x i1> @vctp8_8() {
+; CHECK-LABEL: @vctp8_8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>
+;
+entry:
+ %int = call <16 x i1> @llvm.arm.mve.vctp8(i32 8)
+ ret <16 x i1> %int
+}
+
+define <16 x i1> @vctp8_15() {
+; CHECK-LABEL: @vctp8_15(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false>
+;
+entry:
+ %int = call <16 x i1> @llvm.arm.mve.vctp8(i32 15)
+ ret <16 x i1> %int
+}
+
+define <16 x i1> @vctp8_16() {
+; CHECK-LABEL: @vctp8_16(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+;
+entry:
+ %int = call <16 x i1> @llvm.arm.mve.vctp8(i32 16)
+ ret <16 x i1> %int
+}
+
+define <16 x i1> @vctp8_100() {
+; CHECK-LABEL: @vctp8_100(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+;
+entry:
+ %int = call <16 x i1> @llvm.arm.mve.vctp8(i32 100)
+ ret <16 x i1> %int
+}
+
+define <16 x i1> @vctp8_m1() {
+; CHECK-LABEL: @vctp8_m1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+;
+entry:
+ %int = call <16 x i1> @llvm.arm.mve.vctp8(i32 -1)
+ ret <16 x i1> %int
+}
+
+
+
+define <8 x i1> @vctp16_0() {
+; CHECK-LABEL: @vctp16_0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <8 x i1> zeroinitializer
+;
+entry:
+ %int = call <8 x i1> @llvm.arm.mve.vctp16(i32 0)
+ ret <8 x i1> %int
+}
+
+define <8 x i1> @vctp16_1() {
+; CHECK-LABEL: @vctp16_1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>
+;
+entry:
+ %int = call <8 x i1> @llvm.arm.mve.vctp16(i32 1)
+ ret <8 x i1> %int
+}
+
+define <8 x i1> @vctp16_4() {
+; CHECK-LABEL: @vctp16_4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>
+;
+entry:
+ %int = call <8 x i1> @llvm.arm.mve.vctp16(i32 4)
+ ret <8 x i1> %int
+}
+
+define <8 x i1> @vctp16_7() {
+; CHECK-LABEL: @vctp16_7(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false>
+;
+entry:
+ %int = call <8 x i1> @llvm.arm.mve.vctp16(i32 7)
+ ret <8 x i1> %int
+}
+
+define <8 x i1> @vctp16_8() {
+; CHECK-LABEL: @vctp16_8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+;
+entry:
+ %int = call <8 x i1> @llvm.arm.mve.vctp16(i32 8)
+ ret <8 x i1> %int
+}
+
+define <8 x i1> @vctp16_100() {
+; CHECK-LABEL: @vctp16_100(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+;
+entry:
+ %int = call <8 x i1> @llvm.arm.mve.vctp16(i32 100)
+ ret <8 x i1> %int
+}
+
+define <8 x i1> @vctp16_m1() {
+; CHECK-LABEL: @vctp16_m1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+;
+entry:
+ %int = call <8 x i1> @llvm.arm.mve.vctp16(i32 -1)
+ ret <8 x i1> %int
+}
+
+
+
+define <4 x i1> @vctp32_0() {
+; CHECK-LABEL: @vctp32_0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <4 x i1> zeroinitializer
+;
+entry:
+ %int = call <4 x i1> @llvm.arm.mve.vctp32(i32 0)
+ ret <4 x i1> %int
+}
+
+define <4 x i1> @vctp32_1() {
+; CHECK-LABEL: @vctp32_1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <4 x i1> <i1 true, i1 false, i1 false, i1 false>
+;
+entry:
+ %int = call <4 x i1> @llvm.arm.mve.vctp32(i32 1)
+ ret <4 x i1> %int
+}
+
+define <4 x i1> @vctp32_3() {
+; CHECK-LABEL: @vctp32_3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <4 x i1> <i1 true, i1 true, i1 true, i1 false>
+;
+entry:
+ %int = call <4 x i1> @llvm.arm.mve.vctp32(i32 3)
+ ret <4 x i1> %int
+}
+
+define <4 x i1> @vctp32_4() {
+; CHECK-LABEL: @vctp32_4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <4 x i1> <i1 true, i1 true, i1 true, i1 true>
+;
+entry:
+ %int = call <4 x i1> @llvm.arm.mve.vctp32(i32 4)
+ ret <4 x i1> %int
+}
+
+define <4 x i1> @vctp32_100() {
+; CHECK-LABEL: @vctp32_100(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <4 x i1> <i1 true, i1 true, i1 true, i1 true>
+;
+entry:
+ %int = call <4 x i1> @llvm.arm.mve.vctp32(i32 100)
+ ret <4 x i1> %int
+}
+
+define <4 x i1> @vctp32_m1() {
+; CHECK-LABEL: @vctp32_m1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <4 x i1> <i1 true, i1 true, i1 true, i1 true>
+;
+entry:
+ %int = call <4 x i1> @llvm.arm.mve.vctp32(i32 -1)
+ ret <4 x i1> %int
+}
+
+
+
+define <4 x i1> @vctp64_0() {
+; CHECK-LABEL: @vctp64_0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <4 x i1> zeroinitializer
+;
+entry:
+ %int = call <4 x i1> @llvm.arm.mve.vctp64(i32 0)
+ ret <4 x i1> %int
+}
+
+define <4 x i1> @vctp64_1() {
+; CHECK-LABEL: @vctp64_1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <4 x i1> <i1 true, i1 true, i1 false, i1 false>
+;
+entry:
+ %int = call <4 x i1> @llvm.arm.mve.vctp64(i32 1)
+ ret <4 x i1> %int
+}
+
+define <4 x i1> @vctp64_2() {
+; CHECK-LABEL: @vctp64_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <4 x i1> <i1 true, i1 true, i1 true, i1 true>
+;
+entry:
+ %int = call <4 x i1> @llvm.arm.mve.vctp64(i32 2)
+ ret <4 x i1> %int
+}
+
+define <4 x i1> @vctp64_100() {
+; CHECK-LABEL: @vctp64_100(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <4 x i1> <i1 true, i1 true, i1 true, i1 true>
+;
+entry:
+ %int = call <4 x i1> @llvm.arm.mve.vctp64(i32 100)
+ ret <4 x i1> %int
+}
+
+define <4 x i1> @vctp64_m1() {
+; CHECK-LABEL: @vctp64_m1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <4 x i1> <i1 true, i1 true, i1 true, i1 true>
+;
+entry:
+ %int = call <4 x i1> @llvm.arm.mve.vctp64(i32 -1)
+ ret <4 x i1> %int
+}
+
+
+
+declare <4 x i1> @llvm.arm.mve.vctp64(i32)
+declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+declare <8 x i1> @llvm.arm.mve.vctp16(i32)
+declare <16 x i1> @llvm.arm.mve.vctp8(i32)