[llvm] r363160 - LoopDistribute/LAA: Respect convergent

Wed Jun 12 06:34:20 PDT 2019

Author: arsenm
Date: Wed Jun 12 06:34:19 2019
New Revision: 363160

URL: http://llvm.org/viewvc/llvm-project?rev=363160&view=rev
Log:
LoopDistribute/LAA: Respect convergent

This case is slightly tricky, because loop distribution should be
allowed in some cases, and not others. As long as runtime dependency
checks don't need to be introduced, distribution should be OK. This is
further complicated by the fact that LoopDistribute partially ignores
whether LAA says that vectorization is safe, and then does its own
runtime pointer legality checks.

Note this pass still does not handle noduplicate correctly, as
distribution should always be forbidden when noduplicate is present.
I'm not going to bother trying to fix it, as it would require more
effort and I think noduplicate should be removed.

https://reviews.llvm.org/D62607

Added:
    llvm/trunk/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll
    llvm/trunk/test/Transforms/LoopDistribute/convergent-no-cross-partition-checks.ll
Modified:
    llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
    llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
    llvm/trunk/lib/Transforms/Scalar/LoopDistribute.cpp
    llvm/trunk/test/Transforms/LoopDistribute/basic-with-memchecks.ll
    llvm/trunk/test/Transforms/LoopDistribute/basic.ll
    llvm/trunk/test/Transforms/LoopDistribute/diagnostics.ll
    llvm/trunk/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll

Modified: llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h?rev=363160&r1=363159&r2=363160&view=diff
==============================================================================

--- llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h (original)
+++ llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h Wed Jun 12 06:34:19 2019
@@ -522,6 +522,11 @@ public:
   /// no memory dependence cycles.
   bool canVectorizeMemory() const { return CanVecMem; }
 
+  /// Return true if there is a convergent operation in the loop. There may
+  /// still be reported runtime pointer checks that would be required, but it is
+  /// not legal to insert them.
+  bool hasConvergentOp() const { return HasConvergentOp; }
+
   const RuntimePointerChecking *getRuntimePointerChecking() const {
     return PtrRtChecking.get();
   }
@@ -642,6 +647,7 @@ private:
 
   /// Cache the result of analyzeLoop.
   bool CanVecMem;
+  bool HasConvergentOp;
 
   /// Indicator that there are non vectorizable stores to a uniform address.
   bool HasDependenceInvolvingLoopInvariantAddress;

Modified: llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp?rev=363160&r1=363159&r2=363160&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp (original)
+++ llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp Wed Jun 12 06:34:19 2019
@@ -1778,6 +1778,11 @@ void LoopAccessInfo::analyzeLoop(AliasAn
   unsigned NumReads = 0;
   unsigned NumReadWrites = 0;
 
+  bool HasComplexMemInst = false;
+
+  // A runtime check is only legal to insert if there are no convergent calls.
+  HasConvergentOp = false;
+
   PtrRtChecking->Pointers.clear();
   PtrRtChecking->Need = false;
 
@@ -1785,8 +1790,25 @@ void LoopAccessInfo::analyzeLoop(AliasAn
 
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    // Scan the BB and collect legal loads and stores.
+    // Scan the BB and collect legal loads and stores. Also detect any
+    // convergent instructions.
     for (Instruction &I : *BB) {
+      if (auto *Call = dyn_cast<CallBase>(&I)) {
+        if (Call->isConvergent())
+          HasConvergentOp = true;
+      }
+
+      // With both a non-vectorizable memory instruction and a convergent
+      // operation, found in this loop, no reason to continue the search.
+      if (HasComplexMemInst && HasConvergentOp) {
+        CanVecMem = false;
+        return;
+      }
+
+      // Avoid hitting recordAnalysis multiple times.
+      if (HasComplexMemInst)
+        continue;
+
       // If this is a load, save it. If this instruction can read from memory
       // but is not a load, then we quit. Notice that we don't handle function
       // calls that read or write.
@@ -1805,12 +1827,18 @@ void LoopAccessInfo::analyzeLoop(AliasAn
           continue;
 
         auto *Ld = dyn_cast<LoadInst>(&I);
-        if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
+        if (!Ld) {
+          recordAnalysis("CantVectorizeInstruction", Ld)
+            << "instruction cannot be vectorized";
+          HasComplexMemInst = true;
+          continue;
+        }
+        if (!Ld->isSimple() && !IsAnnotatedParallel) {
           recordAnalysis("NonSimpleLoad", Ld)
               << "read with atomic ordering or volatile read";
           LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
-          CanVecMem = false;
-          return;
+          HasComplexMemInst = true;
+          continue;
         }
         NumLoads++;
         Loads.push_back(Ld);
@@ -1826,15 +1854,15 @@ void LoopAccessInfo::analyzeLoop(AliasAn
         if (!St) {
           recordAnalysis("CantVectorizeInstruction", St)
               << "instruction cannot be vectorized";
-          CanVecMem = false;
-          return;
+          HasComplexMemInst = true;
+          continue;
         }
         if (!St->isSimple() && !IsAnnotatedParallel) {
           recordAnalysis("NonSimpleStore", St)
               << "write with atomic ordering or volatile write";
           LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
-          CanVecMem = false;
-          return;
+          HasComplexMemInst = true;
+          continue;
         }
         NumStores++;
         Stores.push_back(St);
@@ -1845,6 +1873,11 @@ void LoopAccessInfo::analyzeLoop(AliasAn
     } // Next instr.
   } // Next block.
 
+  if (HasComplexMemInst) {
+    CanVecMem = false;
+    return;
+  }
+
   // Now we have two lists that hold the loads and the stores.
   // Next, we find the pointers that they use.
 
@@ -1962,7 +1995,7 @@ void LoopAccessInfo::analyzeLoop(AliasAn
   }
 
   LLVM_DEBUG(
-      dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
+    dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
 
   CanVecMem = true;
   if (Accesses.isDependencyCheckNeeded()) {
@@ -1997,6 +2030,15 @@ void LoopAccessInfo::analyzeLoop(AliasAn
     }
   }
 
+  if (HasConvergentOp) {
+    recordAnalysis("CantInsertRuntimeCheckWithConvergent")
+      << "cannot add control dependency to convergent operation";
+    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
+                         "would be needed with a convergent operation\n");
+    CanVecMem = false;
+    return;
+  }
+
   if (CanVecMem)
     LLVM_DEBUG(
         dbgs() << "LAA: No unsafe dependent memory operations in loop.  We"
@@ -2285,6 +2327,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L,
       PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
       NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
+      HasConvergentOp(false),
       HasDependenceInvolvingLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);
@@ -2301,6 +2344,9 @@ void LoopAccessInfo::print(raw_ostream &
     OS << "\n";
   }
 
+  if (HasConvergentOp)
+    OS.indent(Depth) << "Has convergent operation in loop\n";
+
   if (Report)
     OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";
 

Modified: llvm/trunk/lib/Transforms/Scalar/LoopDistribute.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/LoopDistribute.cpp?rev=363160&r1=363159&r2=363160&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/LoopDistribute.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/LoopDistribute.cpp Wed Jun 12 06:34:19 2019
@@ -766,8 +766,14 @@ public:
                     "cannot isolate unsafe dependencies");
     }
 
-    // Don't distribute the loop if we need too many SCEV run-time checks.
+    // Don't distribute the loop if we need too many SCEV run-time checks, or
+    // any if it's illegal.
     const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
+    if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
+      return fail("RuntimeCheckWithConvergent",
+                  "may not insert runtime check with convergent operation");
+    }
+
     if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                     ? PragmaDistributeSCEVCheckThreshold
                                     : DistributeSCEVCheckThreshold))
@@ -795,7 +801,14 @@ public:
     auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                   RtPtrChecking);
 
+    if (LAI->hasConvergentOp() && !Checks.empty()) {
+      return fail("RuntimeCheckWithConvergent",
+                  "may not insert runtime check with convergent operation");
+    }
+
     if (!Pred.isAlwaysTrue() || !Checks.empty()) {
+      assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");
+
       MDNode *OrigLoopID = L->getLoopID();
 
       LLVM_DEBUG(dbgs() << "\nPointers:\n");

Added: llvm/trunk/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll?rev=363160&view=auto
==============================================================================
--- llvm/trunk/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll (added)
+++ llvm/trunk/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll Wed Jun 12 06:34:19 2019
@@ -0,0 +1,73 @@
+; RUN: opt -loop-accesses -analyze < %s | FileCheck %s
+; RUN: opt -passes='require<scalar-evolution>,require<aa>,loop(print-access-info)' -disable-output  < %s 2>&1 | FileCheck %s
+
+; Analyze this loop:
+;   for (i = 0; i < n; i++)
+;    A[i + 1] = A[i] * B[i] * C[i];
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: for.body:
+; CHECK: Has convergent operation in loop
+; CHECK: Report: cannot add control dependency to convergent operation
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT:   Backward:
+; CHECK-NEXT:     %loadA = load i16, i16* %arrayidxA, align 2 ->
+; CHECK-NEXT:     store i16 %mul1, i16* %arrayidxA_plus_2, align 2
+; CHECK: Run-time memory checks:
+; CHECK-NEXT: 0:
+; CHECK-NEXT: Comparing group
+; CHECK-NEXT:   %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+; CHECK-NEXT:   %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+; CHECK-NEXT: Against group
+; CHECK-NEXT:   %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3
+; CHECK-NEXT: 1:
+; CHECK-NEXT: Comparing group
+; CHECK-NEXT:   %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+; CHECK-NEXT:   %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+; CHECK-NEXT: Against group
+; CHECK-NEXT:   %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3
+
+@B = common global i16* null, align 8
+@A = common global i16* null, align 8
+@C = common global i16* null, align 8
+
+define void @f() #1 {
+entry:
+  %a = load i16*, i16** @A, align 8
+  %b = load i16*, i16** @B, align 8
+  %c = load i16*, i16** @C, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %storemerge3 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+  %loadA = load i16, i16* %arrayidxA, align 2
+
+  %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3
+  %loadB = load i16, i16* %arrayidxB, align 2
+
+  %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3
+  %loadC = load i16, i16* %arrayidxC, align 2
+
+  call void @llvm.convergent()
+
+  %mul = mul i16 %loadB, %loadA
+  %mul1 = mul i16 %mul, %loadC
+
+  %add = add nuw nsw i64 %storemerge3, 1
+  %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+  store i16 %mul1, i16* %arrayidxA_plus_2, align 2
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare void @llvm.convergent() #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }

Modified: llvm/trunk/test/Transforms/LoopDistribute/basic-with-memchecks.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/basic-with-memchecks.ll?rev=363160&r1=363159&r2=363160&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/basic-with-memchecks.ll (original)
+++ llvm/trunk/test/Transforms/LoopDistribute/basic-with-memchecks.ll Wed Jun 12 06:34:19 2019
@@ -5,6 +5,9 @@
 ; RUN:   -verify-loop-info -verify-dom-info -S < %s | \
 ; RUN:   FileCheck --check-prefix=VECTORIZE %s
 
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info \
+; RUN:   -loop-accesses -analyze < %s | FileCheck %s --check-prefix=ANALYSIS
+
 ; The memcheck version of basic.ll.  We should distribute and vectorize the
 ; second part of this loop with 5 memchecks (A+1 x {C, D, E} + C x {A, B})
 ;
@@ -173,3 +176,113 @@ for.body:
 for.end:
   ret void
 }
+
+declare i32 @llvm.convergent(i32) #0
+
+; This is the same as f, and would require the same bounds
+; check. However, it is not OK to introduce new control dependencies
+; on the convergent call.
+
+; CHECK-LABEL: @f_with_convergent(
+; CHECK: call i32 @llvm.convergent
+; CHECK-NOT: call i32 @llvm.convergent
+
+; ANALYSIS: for.body:
+; ANALYSIS: Report: cannot add control dependency to convergent operation
+define void @f_with_convergent() #1 {
+entry:
+  %a = load i32*, i32** @A, align 8
+  %b = load i32*, i32** @B, align 8
+  %c = load i32*, i32** @C, align 8
+  %d = load i32*, i32** @D, align 8
+  %e = load i32*, i32** @E, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Make sure an explicit request for distribution is ignored if it
+; requires possibly illegal checks.
+
+; CHECK-LABEL: @f_with_convergent_forced_distribute(
+; CHECK: call i32 @llvm.convergent
+; CHECK-NOT: call i32 @llvm.convergent
+define void @f_with_convergent_forced_distribute() #1 {
+entry:
+  %a = load i32*, i32** @A, align 8
+  %b = load i32*, i32** @B, align 8
+  %c = load i32*, i32** @C, align 8
+  %d = load i32*, i32** @D, align 8
+  %e = load i32*, i32** @E, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.distribute.enable", i1 true}

Modified: llvm/trunk/test/Transforms/LoopDistribute/basic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/basic.ll?rev=363160&r1=363159&r2=363160&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/basic.ll (original)
+++ llvm/trunk/test/Transforms/LoopDistribute/basic.ll Wed Jun 12 06:34:19 2019
@@ -18,6 +18,7 @@
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.10.0"
 
+; CHECK-LABEL: @f(
 define void @f(i32* noalias %a,
                i32* noalias %b,
                i32* noalias %c,
@@ -81,3 +82,78 @@ for.body:
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+declare i32 @llvm.convergent(i32) #0
+
+; It is OK to distribute with a convergent operation, since in each
+; new loop the convergent operation has the same control dependency.
+; CHECK-LABEL: @f_with_convergent(
+define void @f_with_convergent(i32* noalias %a,
+                               i32* noalias %b,
+                               i32* noalias %c,
+                               i32* noalias %d,
+                               i32* noalias %e) {
+entry:
+  br label %for.body
+
+; Verify the two distributed loops.
+
+; CHECK: entry.split.ldist1:
+; CHECK:    br label %for.body.ldist1
+; CHECK: for.body.ldist1:
+; CHECK:    %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1
+; CHECK:    br i1 %exitcond.ldist1, label %entry.split, label %for.body.ldist1
+
+; CHECK: entry.split:
+; CHECK:    br label %for.body
+; CHECK: for.body:
+; CHECK:    %convergentD = call i32 @llvm.convergent(i32 %loadD)
+; CHECK:    %mulC = mul i32 %convergentD, %loadE
+; CHECK: for.end:
+
+
+; ANALYSIS: for.body:
+; ANALYSIS-NEXT: Has convergent operation in loop
+; ANALYSIS-NEXT: Report: cannot add control dependency to convergent operation
+; ANALYSIS: for.body.ldist1:
+; ANALYSIS-NEXT: Report: unsafe dependent memory operations in loop
+
+; convergent instruction happens to block vectorization
+; VECTORIZE: call i32 @llvm.convergent
+; VECTORIZE: mul i32
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { nounwind readnone convergent }

Added: llvm/trunk/test/Transforms/LoopDistribute/convergent-no-cross-partition-checks.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/convergent-no-cross-partition-checks.ll?rev=363160&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/convergent-no-cross-partition-checks.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/convergent-no-cross-partition-checks.ll Wed Jun 12 06:34:19 2019
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute \
+; RUN:   -verify-loop-info -verify-dom-info -S < %s | FileCheck %s
+
+; Derived from crash-in-memcheck-generation.ll
+
+; Make sure the loop is distributed even with a convergent
+; op. LoopAccessAnalysis says that runtime checks are necessary, but
+; none are cross partition, so none are truly needed.
+
+define void @f(i32* %a, i32* %b, i32* noalias %c, i32* noalias %d, i32* noalias %e) #1 {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_LDIST1:%.*]]
+; CHECK:       entry.split.ldist1:
+; CHECK-NEXT:    br label [[FOR_BODY_LDIST1:%.*]]
+; CHECK:       for.body.ldist1:
+; CHECK-NEXT:    [[IND_LDIST1:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT_LDIST1]] ], [ [[ADD_LDIST1:%.*]], [[FOR_BODY_LDIST1]] ]
+; CHECK-NEXT:    [[ARRAYIDXA_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IND_LDIST1]]
+; CHECK-NEXT:    [[LOADA_LDIST1:%.*]] = load i32, i32* [[ARRAYIDXA_LDIST1]], align 4
+; CHECK-NEXT:    [[ARRAYIDXB_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[IND_LDIST1]]
+; CHECK-NEXT:    [[LOADB_LDIST1:%.*]] = load i32, i32* [[ARRAYIDXB_LDIST1]], align 4
+; CHECK-NEXT:    [[MULA_LDIST1:%.*]] = mul i32 [[LOADB_LDIST1]], [[LOADA_LDIST1]]
+; CHECK-NEXT:    [[ADD_LDIST1]] = add nuw nsw i64 [[IND_LDIST1]], 1
+; CHECK-NEXT:    [[ARRAYIDXA_PLUS_4_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[ADD_LDIST1]]
+; CHECK-NEXT:    store i32 [[MULA_LDIST1]], i32* [[ARRAYIDXA_PLUS_4_LDIST1]], align 4
+; CHECK-NEXT:    [[EXITCOND_LDIST1:%.*]] = icmp eq i64 [[ADD_LDIST1]], 20
+; CHECK-NEXT:    br i1 [[EXITCOND_LDIST1]], label [[ENTRY_SPLIT:%.*]], label [[FOR_BODY_LDIST1]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IND:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ADD]] = add nuw nsw i64 [[IND]], 1
+; CHECK-NEXT:    [[ARRAYIDXD:%.*]] = getelementptr inbounds i32, i32* [[D:%.*]], i64 [[IND]]
+; CHECK-NEXT:    [[LOADD:%.*]] = load i32, i32* [[ARRAYIDXD]], align 4
+; CHECK-NEXT:    [[CONVERGENTD:%.*]] = call i32 @llvm.convergent(i32 [[LOADD]])
+; CHECK-NEXT:    [[ARRAYIDXE:%.*]] = getelementptr inbounds i32, i32* [[E:%.*]], i64 [[IND]]
+; CHECK-NEXT:    [[LOADE:%.*]] = load i32, i32* [[ARRAYIDXE]], align 4
+; CHECK-NEXT:    [[MULC:%.*]] = mul i32 [[CONVERGENTD]], [[LOADE]]
+; CHECK-NEXT:    [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[IND]]
+; CHECK-NEXT:    store i32 [[MULC]], i32* [[ARRAYIDXC]], align 4
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], 20
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare i32 @llvm.convergent(i32) #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }

Modified: llvm/trunk/test/Transforms/LoopDistribute/diagnostics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/diagnostics.ll?rev=363160&r1=363159&r2=363160&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/diagnostics.ll (original)
+++ llvm/trunk/test/Transforms/LoopDistribute/diagnostics.ll Wed Jun 12 06:34:19 2019
@@ -131,6 +131,50 @@ for.cond.cleanup:
   ret void, !dbg !34
 }
 
+; MISSED_REMARKS: /tmp/t.c:27:5: loop not distributed: use -Rpass-analysis=loop-distribute for more info
+; ANALYSIS_REMARKS: /tmp/t.c:27:5: loop not distributed: may not insert runtime check with convergent operation
+; ALWAYS: warning: /tmp/t.c:27:5: loop not distributed: failed explicitly specified loop distribution
+define void @convergent(i8* %A, i8* %B, i8* %C, i8* %D, i8* %E, i32 %N) #1 !dbg !45 {
+entry:
+  %cmp28 = icmp sgt i32 %N, 0, !dbg !46
+  br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !47
+
+ph:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !49
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !49, !tbaa !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !50
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !50, !tbaa !13
+  %add = add i8 %1, %0, !dbg !51
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !57
+  %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !52
+  store i8 %add, i8* %arrayidx7, align 1, !dbg !53, !tbaa !13
+  %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !54
+  %2 = load i8, i8* %arrayidx9, align 1, !dbg !54, !tbaa !13
+  %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !55
+  %3 = load i8, i8* %arrayidx12, align 1, !dbg !55, !tbaa !13
+  %mul = mul i8 %3, %2, !dbg !56
+  %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !57
+  store i8 %mul, i8* %arrayidx16, align 1, !dbg !58, !tbaa !13
+  call void @llvm.convergent()
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !57
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !57
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !20, !dbg !57
+
+for.cond.cleanup:
+  ret void, !dbg !58
+}
+
+
+declare void @llvm.convergent() #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
+
+
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
 
@@ -177,3 +221,17 @@ for.cond.cleanup:
 !42 = !DILocation(line: 17, column: 17, scope: !31)
 !43 = !DILocation(line: 17, column: 5, scope: !31)
 !44 = !DILocation(line: 17, column: 10, scope: !31)
+!45 = distinct !DISubprogram(name: "convergent", scope: !1, file: !1, line: 24, type: !8, isLocal: false, isDefinition: true, scopeLine: 24, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!46 = !DILocation(line: 25, column: 20, scope: !45)
+!47 = !DILocation(line: 25, column: 3, scope: !45)
+!48 = !DILocation(line: 29, column: 1, scope: !45)
+!49 = !DILocation(line: 26, column: 16, scope: !45)
+!50 = !DILocation(line: 26, column: 23, scope: !45)
+!51 = !DILocation(line: 26, column: 21, scope: !45)
+!52 = !DILocation(line: 26, column: 5, scope: !45)
+!53 = !DILocation(line: 26, column: 14, scope: !45)
+!54 = !DILocation(line: 27, column: 12, scope: !45)
+!55 = !DILocation(line: 27, column: 19, scope: !45)
+!56 = !DILocation(line: 27, column: 17, scope: !45)
+!57 = !DILocation(line: 27, column: 5, scope: !45)
+!58 = !DILocation(line: 27, column: 10, scope: !45)

Modified: llvm/trunk/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll?rev=363160&r1=363159&r2=363160&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll (original)
+++ llvm/trunk/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll Wed Jun 12 06:34:19 2019
@@ -7,7 +7,6 @@ target datalayout = "e-m:o-i64:64-f80:12
 ; not based on memory access.
 
 define void @f(i32* noalias %a, i32* noalias %b, i32* noalias %c, i32* noalias %d, i32* noalias %e, i64 %N) {
-
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A2:%.*]] = ptrtoint i32* [[A:%.*]] to i64
@@ -101,6 +100,7 @@ define void @f(i32* noalias %a, i32* noa
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -143,3 +143,84 @@ for.body:
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+; Can't add control dependency with convergent in loop body.
+define void @f_with_convergent(i32* noalias %a, i32* noalias %b, i32* noalias %c, i32* noalias %d, i32* noalias %e, i64 %N) #1 {
+; CHECK-LABEL: @f_with_convergent(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IND:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IND1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[IND1]], 2
+; CHECK-NEXT:    [[MUL_EXT:%.*]] = zext i32 [[MUL]] to i64
+; CHECK-NEXT:    [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    [[LOADA:%.*]] = load i32, i32* [[ARRAYIDXA]], align 4
+; CHECK-NEXT:    [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    [[LOADB:%.*]] = load i32, i32* [[ARRAYIDXB]], align 4
+; CHECK-NEXT:    [[MULA:%.*]] = mul i32 [[LOADB]], [[LOADA]]
+; CHECK-NEXT:    [[ADD]] = add nuw nsw i64 [[IND]], 1
+; CHECK-NEXT:    [[INC1]] = add i32 [[IND1]], 1
+; CHECK-NEXT:    [[ARRAYIDXA_PLUS_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[ADD]]
+; CHECK-NEXT:    store i32 [[MULA]], i32* [[ARRAYIDXA_PLUS_4]], align 4
+; CHECK-NEXT:    [[ARRAYIDXD:%.*]] = getelementptr inbounds i32, i32* [[D:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    [[LOADD:%.*]] = load i32, i32* [[ARRAYIDXD]], align 4
+; CHECK-NEXT:    [[ARRAYIDXE:%.*]] = getelementptr inbounds i32, i32* [[E:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    [[LOADE:%.*]] = load i32, i32* [[ARRAYIDXE]], align 4
+; CHECK-NEXT:    [[CONVERGENTD:%.*]] = call i32 @llvm.convergent(i32 [[LOADD]])
+; CHECK-NEXT:    [[MULC:%.*]] = mul i32 [[CONVERGENTD]], [[LOADE]]
+; CHECK-NEXT:    [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    store i32 [[MULC]], i32* [[ARRAYIDXC]], align 4
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  %ind1 = phi i32 [ 0, %entry ], [ %inc1, %for.body ]
+
+  %mul = mul i32 %ind1, 2
+  %mul_ext = zext i32 %mul to i64
+
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %mul_ext
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %mul_ext
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %inc1 = add i32 %ind1, 1
+
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %mul_ext
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %mul_ext
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %mul_ext
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare i32 @llvm.convergent(i32) #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }