[llvm] 69cccb3 - [SVE] Fix isLoadInvariantInLoop for scalable vectors

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 15 00:30:40 PDT 2020


Author: David Sherwood
Date: 2020-09-15T08:30:19+01:00
New Revision: 69cccb3189d6e0535ab78411a37cfcccf06a58a7

URL: https://github.com/llvm/llvm-project/commit/69cccb3189d6e0535ab78411a37cfcccf06a58a7
DIFF: https://github.com/llvm/llvm-project/commit/69cccb3189d6e0535ab78411a37cfcccf06a58a7.diff

LOG: [SVE] Fix isLoadInvariantInLoop for scalable vectors

I've amended the isLoadInvariantInLoop function to bail out for
scalable vectors for now, since the invariant.start intrinsic is only
ever generated by the clang frontend for thread locals or for struct
and class constructors, neither of which support sizeless types. In
addition, the intrinsic itself does not currently support the concept
of a scaled size, which makes it impossible to compare the sizes of
different scalable objects, e.g. <vscale x 32 x i8> and
<vscale x 16 x i8>.
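
To illustrate the problem, here is a minimal IR sketch (the pointers
%obj32 and %obj16 are hypothetical, not taken from the patch): any
object whose size depends on vscale gets the same -1 size operand, so
the two markers below are indistinguishable even though the first
object is twice the size of the second at any given vscale:

  ; Marks an object of type <vscale x 32 x i8> as invariant; the size
  ; operand is -1 because the byte count is a multiple of vscale.
  %inv1 = call {}* @llvm.invariant.start.p0i8(i64 -1, i8* %obj32)
  ; The smaller <vscale x 16 x i8> object gets the same -1 size.
  %inv2 = call {}* @llvm.invariant.start.p0i8(i64 -1, i8* %obj16)

  declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly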

Added new tests here:

  Transforms/LICM/AArch64/sve-load-hoist.ll
  Transforms/LICM/hoisting.ll

Differential Revision: https://reviews.llvm.org/D87227

Added: 
    llvm/test/Transforms/LICM/AArch64/lit.local.cfg
    llvm/test/Transforms/LICM/AArch64/sve-load-hoist.ll

Modified: 
    llvm/lib/IR/Verifier.cpp
    llvm/lib/Transforms/Scalar/LICM.cpp
    llvm/test/Transforms/LICM/hoisting.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 783c492dbeae..a5baa2bf1631 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5010,6 +5010,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     Assert(Size % 16 == 0, "bswap must be an even number of bytes", &Call);
     break;
   }
+  case Intrinsic::invariant_start: {
+    ConstantInt *InvariantSize = dyn_cast<ConstantInt>(Call.getArgOperand(0));
+    Assert(InvariantSize &&
+               (!InvariantSize->isNegative() || InvariantSize->isMinusOne()),
+           "invariant_start parameter must be -1, 0 or a positive number",
+           &Call);
+    break;
+  }
   case Intrinsic::matrix_multiply:
   case Intrinsic::matrix_transpose:
   case Intrinsic::matrix_column_major_load:

diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 4bf39ba8f151..b741d36e37bf 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -940,7 +940,19 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
                                   Loop *CurLoop) {
   Value *Addr = LI->getOperand(0);
   const DataLayout &DL = LI->getModule()->getDataLayout();
-  const uint32_t LocSizeInBits = DL.getTypeSizeInBits(LI->getType());
+  const TypeSize LocSizeInBits = DL.getTypeSizeInBits(LI->getType());
+
+  // It is not currently possible for clang to generate an invariant.start
+  // intrinsic with scalable vector types because we don't support thread local
+  // sizeless types and we don't permit sizeless types in structs or classes.
+  // Furthermore, even if support is added for this in future the intrinsic
+  // itself is defined to have a size of -1 for variable sized objects. This
+  // makes it impossible to verify if the intrinsic envelops our region of
+  // interest. For example, both <vscale x 32 x i8> and <vscale x 16 x i8>
+  // types would have a -1 parameter, but the former is clearly double the size
+  // of the latter.
+  if (LocSizeInBits.isScalable())
+    return false;
 
   // if the type is i8 addrspace(x)*, we know this is the type of
   // llvm.invariant.start operand
@@ -970,13 +982,17 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
     if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
         !II->use_empty())
       continue;
-    unsigned InvariantSizeInBits =
-        cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8;
+    ConstantInt *InvariantSize = cast<ConstantInt>(II->getArgOperand(0));
+    // The intrinsic supports having a -1 argument for variable sized objects
+    // so we should check for that here.
+    if (InvariantSize->isNegative())
+      continue;
+    uint64_t InvariantSizeInBits = InvariantSize->getSExtValue() * 8;
     // Confirm the invariant.start location size contains the load operand size
     // in bits. Also, the invariant.start should dominate the load, and we
     // should not hoist the load out of a loop that contains this dominating
     // invariant.start.
-    if (LocSizeInBits <= InvariantSizeInBits &&
+    if (LocSizeInBits.getFixedSize() <= InvariantSizeInBits &&
         DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
       return true;
   }

diff --git a/llvm/test/Transforms/LICM/AArch64/lit.local.cfg b/llvm/test/Transforms/LICM/AArch64/lit.local.cfg
new file mode 100644
index 000000000000..7184443994b6
--- /dev/null
+++ b/llvm/test/Transforms/LICM/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True

diff --git a/llvm/test/Transforms/LICM/AArch64/sve-load-hoist.ll b/llvm/test/Transforms/LICM/AArch64/sve-load-hoist.ll
new file mode 100644
index 000000000000..b0fcdb7d8dfc
--- /dev/null
+++ b/llvm/test/Transforms/LICM/AArch64/sve-load-hoist.ll
@@ -0,0 +1,30 @@
+; RUN: opt -licm -mtriple aarch64-linux-gnu -mattr=+sve -S < %s | FileCheck %s
+
+define void @no_hoist_load1_nxv2i64(<vscale x 2 x i64>* %out, i8* %in8, i32 %n) {
+; CHECK-LABEL: @no_hoist_load1_nxv2i64(
+; CHECK: entry:
+; CHECK-NOT: load
+; CHECK: for.body:
+; CHECK: load
+entry:
+  %cmp0 = icmp ugt i32 %n, 0
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 16, i8* %in8)
+  %in = bitcast i8* %in8 to <vscale x 2 x i64>*
+  br i1 %cmp0, label %for.body, label %for.end
+
+for.body:
+  %i = phi i32 [0, %entry], [%inc, %for.body]
+  %i2 = zext i32 %i to i64
+  %ptr = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %out, i64 %i2
+  %val = load <vscale x 2 x i64>, <vscale x 2 x i64>* %in, align 16
+  store <vscale x 2 x i64> %val, <vscale x 2 x i64>* %ptr, align 16
+  %inc = add nuw nsw i32 %i, 1
+  %cmp = icmp ult i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
+

diff --git a/llvm/test/Transforms/LICM/hoisting.ll b/llvm/test/Transforms/LICM/hoisting.ll
index 97609fa397e4..00ac0f5756de 100644
--- a/llvm/test/Transforms/LICM/hoisting.ll
+++ b/llvm/test/Transforms/LICM/hoisting.ll
@@ -360,3 +360,36 @@ loop:
 loopexit:
   ret i32 %sum
 }
+
+; We can't hoist the invariant load out of the loop because
+; the marker is given a variable size (-1).
+define i32 @test_fence5(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence5
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry:
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 -1, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}

