[PATCH] D130482: [LAA] Avoid non-determinism due to blocks order. PR56672

Mon Jul 25 06:29:50 PDT 2022

mkazantsev created this revision.
mkazantsev added reviewers: aeubanks, huntergr, fhahn, reames.
Herald added subscribers: mgrang, hiraditya.
Herald added a project: All.
mkazantsev requested review of this revision.
Herald added a project: LLVM.
Herald added a subscriber: llvm-commits.

As test in PR56672 shows, LAA produces different results which lead to either
positive or negative vectorization decisions depending on the order of blocks
in loop. The exact reason of this is not clear to me, however this makes investigation
of related bugs extremely complex.

Current order of blocks in the loop is arbitrary. It may change, for example, if loop
info analysis is dropped and recomputed. Seems that it interferes with LAA's logic.
This patch fixates the traversal order of blocks in loops, making it RPOT.


https://reviews.llvm.org/D130482

Files:
  llvm/lib/Analysis/LoopAccessAnalysis.cpp
  llvm/test/Transforms/LoopVectorize/X86/pr56672.ll


Index: llvm/test/Transforms/LoopVectorize/X86/pr56672.ll
===================================================================

--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/X86/pr56672.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='loop(loop-rotate),loop-vectorize' -S -mcpu=skylake-avx512 -S %s | FileCheck %s
+; RUN: opt -passes='loop(loop-rotate),invalidate<loops>,loop-vectorize' -S -mcpu=skylake-avx512 -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Make sure that vectorizer's behavior based on LAA remains the same regardless
+; of the order of blocks in loop.
+define void @test()  {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP81:%.*]] = load i32, i32 addrspace(1)* getelementptr inbounds (i32, i32 addrspace(1)* null, i64 16), align 4
+; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP81]], -5
+; CHECK-NEXT:    store i32 [[TMP92]], i32 addrspace(1)* getelementptr inbounds (i32, i32 addrspace(1)* null, i64 16), align 4
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP74:%.*]] = phi i32 addrspace(1)* [ getelementptr inbounds (i32, i32 addrspace(1)* null, i64 16), [[BB:%.*]] ], [ [[TMP7:%.*]], [[BB4:%.*]] ]
+; CHECK-NEXT:    [[TMP53:%.*]] = phi i64 [ 16, [[BB]] ], [ [[TMP5:%.*]], [[BB4]] ]
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    store i32 1, i32 addrspace(1)* [[TMP74]], align 4
+; CHECK-NEXT:    [[TMP:%.*]] = add nuw i64 [[TMP53]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP]], undef
+; CHECK-NEXT:    br i1 [[TMP3]], label [[BB4]], label [[BB11:%.*]]
+; CHECK:       bb4:
+; CHECK-NEXT:    [[TMP5]] = phi i64 [ [[TMP]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi i64 [ [[TMP53]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP7]] = getelementptr inbounds i32, i32 addrspace(1)* null, i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32 addrspace(1)* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP8]], -5
+; CHECK-NEXT:    store i32 [[TMP9]], i32 addrspace(1)* [[TMP7]], align 4
+; CHECK-NEXT:    br i1 false, label [[BB10:%.*]], label [[BB1]]
+; CHECK:       bb10:
+; CHECK-NEXT:    unreachable
+; CHECK:       bb11:
+; CHECK-NEXT:    ret void
+;
+bb:
+  br label %bb4
+
+bb1:                                              ; preds = %bb4
+  br label %bb2
+
+bb2:                                              ; preds = %bb1
+  store i32 1, i32 addrspace(1)* %tmp7, align 4
+  %tmp = add nuw i64 %tmp5, 1
+  %tmp3 = icmp ult i64 %tmp, undef
+  br i1 %tmp3, label %bb4, label %bb11
+
+bb4:                                              ; preds = %bb2, %bb
+  %tmp5 = phi i64 [ %tmp, %bb2 ], [ 16, %bb ]
+  %tmp6 = phi i64 [ %tmp5, %bb2 ], [ 15, %bb ]
+  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* null, i64 %tmp5
+  %tmp8 = load i32, i32 addrspace(1)* %tmp7, align 4
+  %tmp9 = add i32 %tmp8, -5
+  store i32 %tmp9, i32 addrspace(1)* %tmp7, align 4
+  br i1 false, label %bb10, label %bb1
+
+bb10:                                             ; preds = %bb4
+  unreachable
+
+bb11:                                             ; preds = %bb2
+  ret void
+}
Index: llvm/lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@@ -2127,8 +2128,11 @@
       EnableMemAccessVersioning &&
       !TheLoop->getHeader()->getParent()->hasOptSize();
 
-  // For each block.
-  for (BasicBlock *BB : TheLoop->blocks()) {
+  // Traverse blocks in deterministic order, regardless of their storage in
+  // the loop info.
+  LoopBlocksRPO RPOT(TheLoop);
+  RPOT.perform(LI);
+  for (BasicBlock *BB : RPOT) {
     // Scan the BB and collect legal loads and stores. Also detect any
     // convergent instructions.
     for (Instruction &I : *BB) {


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D130482.447307.patch
Type: text/x-patch
Size: 4359 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20220725/358a2adf/attachment.bin>