[LLVMdev] Improving loop vectorizer support for loops with a volatile iteration variable

Mon Aug 3 14:33:12 PDT 2015

Hi,

I discussed the issue offline with Hal, and would like to clarify what is
exactly going on, what are trade-offs for different solutions, and ask for
more feedback on my proposed solution (http://reviews.llvm.org/D11728). I
will use the example from Hal's post:

void foo2(float * restrict x, float * restrict y, float * restrict z) {
  for (volatile int i = 0; i < 1000; ++i) {
    for (int j = 0; j < 1600; ++j) {
      x[j] = y[j] + z[j];
    }
  }
}

IR after the first loop simplify: A preheader is created.

; Function Attrs: nounwind
define void @foo2(float* noalias nocapture %x, float* noalias nocapture
readonly %y, float* noalias nocapture readonly %z) #0 {
entry:
  %i = alloca i32, align 4
  tail call void @llvm.dbg.value(metadata float* %x, i64 0, metadata !11,
metadata !25), !dbg !26
  tail call void @llvm.dbg.value(metadata float* %y, i64 0, metadata !12,
metadata !25), !dbg !27
  tail call void @llvm.dbg.value(metadata float* %z, i64 0, metadata !13,
metadata !25), !dbg !28
  %i.0.i.0..sroa_cast = bitcast i32* %i to i8*
  call void @llvm.lifetime.start(i64 4, i8* %i.0.i.0..sroa_cast)
  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !14,
metadata !25), !dbg !29
  store volatile i32 0, i32* %i, align 4, !dbg !29
  br label %for.cond, !dbg !30

for.cond:                                         ; preds =
%for.cond.cleanup.3, %entry
  tail call void @llvm.dbg.value(metadata i32* %i, i64 0, metadata !14,
metadata !25), !dbg !29
  %i.0.i.0. = load volatile i32, i32* %i, align 4, !dbg !31
  %cmp = icmp slt i32 %i.0.i.0., 1000, !dbg !34
  br i1 %cmp, label %for.cond.1.preheader, label
%for.cond.cleanup, !dbg !35

for.cond.1.preheader:                             ; preds = %for.cond
  br label %for.cond.1, !dbg !36

for.cond.cleanup:                                 ; preds = %for.cond
  call void @llvm.lifetime.end(i64 4, i8* %i.0.i.0..sroa_cast)
  ret void, !dbg !38

for.cond.1:                                       ; preds =
%for.cond.1.preheader, %for.body.4
  %j.0 = phi i32 [ %inc, %for.body.4 ], [ 0, %for.cond.1.preheader ]
  %cmp2 = icmp slt i32 %j.0, 1600, !dbg !36
  br i1 %cmp2, label %for.body.4, label %for.cond.cleanup.3, !dbg !39

for.cond.cleanup.3:                               ; preds = %for.cond.1
  tail call void @llvm.dbg.value(metadata i32* %i, i64 0, metadata !14,
metadata !25), !dbg !29
  %i.0.i.0.17 = load volatile i32, i32* %i, align 4, !dbg !40
  %inc10 = add nsw i32 %i.0.i.0.17, 1, !dbg !40
  tail call void @llvm.dbg.value(metadata i32 %inc10, i64 0, metadata !14,
metadata !25), !dbg !29
  store volatile i32 %inc10, i32* %i, align 4, !dbg !40
  br label %for.cond, !dbg !41

for.body.4:                                       ; preds = %for.cond.1
  %idxprom = sext i32 %j.0 to i64, !dbg !42
  %arrayidx = getelementptr inbounds float, float* %y, i64
%idxprom, !dbg !42
  %0 = load float, float* %arrayidx, align 4, !dbg !42, !tbaa !44
  %arrayidx6 = getelementptr inbounds float, float* %z, i64
%idxprom, !dbg !48
  %1 = load float, float* %arrayidx6, align 4, !dbg !48, !tbaa !44
  %add = fadd float %0, %1, !dbg !49
  %arrayidx8 = getelementptr inbounds float, float* %x, i64
%idxprom, !dbg !50
  store float %add, float* %arrayidx8, align 4, !dbg !51, !tbaa !44
  %inc = add nsw i32 %j.0, 1, !dbg !52
  tail call void @llvm.dbg.value(metadata i32 %inc, i64 0, metadata !18,
metadata !25), !dbg !53
  br label %for.cond.1, !dbg !54
}

IR after loop rotation: After loop rotation, a rotated preheader
(for.cond.1.preheader.lr.ph) is created. A test for (i < 1000) is added at
the end of "entry" block. If true, the control jumps unconditionally to
"for.body.4" through "for.cond.1.preheader.lr.ph" and
"for.cond.1.preheader". You can see that these two blocks
("for.cond.1.preheader.lr.ph" and "for.cond.1.preheader") are practically
empty, and they will get eliminated later by Jump Threading and/or
Simplify-the-CFG. *IF* the outer loop has a non-volatile induction
variable, the loop will not be rotated in the first place as
"for.cond.1.preheader" has a PHI node for "i", and these blocks will not be
eliminated.

; Function Attrs: nounwind
define void @foo2(float* noalias nocapture %x, float* noalias nocapture
readonly %y, float* noalias nocapture readonly %z) #0 {
entry:
  %i = alloca i32, align 4
  tail call void @llvm.dbg.value(metadata float* %x, i64 0, metadata !11,
metadata !25), !dbg !26
  tail call void @llvm.dbg.value(metadata float* %y, i64 0, metadata !12,
metadata !25), !dbg !27
  tail call void @llvm.dbg.value(metadata float* %z, i64 0, metadata !13,
metadata !25), !dbg !28
  %i.0.i.0..sroa_cast = bitcast i32* %i to i8*
  call void @llvm.lifetime.start(i64 4, i8* %i.0.i.0..sroa_cast)
  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !14,
metadata !25), !dbg !29
  store volatile i32 0, i32* %i, align 4, !dbg !29
  tail call void @llvm.dbg.value(metadata i32* %i, i64 0, metadata !14,
metadata !25), !dbg !29
  %i.0.i.0..21 = load volatile i32, i32* %i, align 4, !dbg !30
  %cmp.22 = icmp slt i32 %i.0.i.0..21, 1000, !dbg !33
  br i1 %cmp.22, label %for.cond.1.preheader.lr.ph, label
%for.cond.cleanup, !dbg !34

for.cond.1.preheader.lr.ph:                       ; preds = %entry
  br label %for.cond.1.preheader, !dbg !34

for.cond.1.preheader:                             ; preds =
%for.cond.1.preheader.lr.ph, %for.cond.cleanup.3
  br label %for.body.4, !dbg !35

for.cond.for.cond.cleanup_crit_edge:              ; preds =
%for.cond.cleanup.3
  br label %for.cond.cleanup, !dbg !34

for.cond.cleanup:                                 ; preds =
%for.cond.for.cond.cleanup_crit_edge, %entry
  call void @llvm.lifetime.end(i64 4, i8* %i.0.i.0..sroa_cast)
  ret void, !dbg !36

for.cond.cleanup.3:                               ; preds = %for.body.4
  tail call void @llvm.dbg.value(metadata i32* %i, i64 0, metadata !14,
metadata !25), !dbg !29
  %i.0.i.0.17 = load volatile i32, i32* %i, align 4, !dbg !37
  %inc10 = add nsw i32 %i.0.i.0.17, 1, !dbg !37
  tail call void @llvm.dbg.value(metadata i32 %inc10, i64 0, metadata !14,
metadata !25), !dbg !29
  store volatile i32 %inc10, i32* %i, align 4, !dbg !37
  tail call void @llvm.dbg.value(metadata i32* %i, i64 0, metadata !14,
metadata !25), !dbg !29
  %i.0.i.0. = load volatile i32, i32* %i, align 4, !dbg !30
  %cmp = icmp slt i32 %i.0.i.0., 1000, !dbg !33
  br i1 %cmp, label %for.cond.1.preheader, label
%for.cond.for.cond.cleanup_crit_edge, !dbg !34

for.body.4:                                       ; preds =
%for.cond.1.preheader, %for.body.4
  %j.020 = phi i32 [ 0, %for.cond.1.preheader ], [ %inc, %for.body.4 ]
  %idxprom = sext i32 %j.020 to i64, !dbg !38
  %arrayidx = getelementptr inbounds float, float* %y, i64
%idxprom, !dbg !38
  %0 = load float, float* %arrayidx, align 4, !dbg !38, !tbaa !41
  %arrayidx6 = getelementptr inbounds float, float* %z, i64
%idxprom, !dbg !45
  %1 = load float, float* %arrayidx6, align 4, !dbg !45, !tbaa !41
  %add = fadd float %0, %1, !dbg !46
  %arrayidx8 = getelementptr inbounds float, float* %x, i64
%idxprom, !dbg !47
  store float %add, float* %arrayidx8, align 4, !dbg !48, !tbaa !41
  %inc = add nsw i32 %j.020, 1, !dbg !49
  tail call void @llvm.dbg.value(metadata i32 %inc, i64 0, metadata !18,
metadata !25), !dbg !50
  %cmp2 = icmp slt i32 %inc, 1600, !dbg !51
  br i1 %cmp2, label %for.body.4, label %for.cond.cleanup.3, !dbg !35

After Jump Threading: "for.cond.1.preheader.lr.ph" and
"for.cond.1.preheader" are merged into "for.body.4" by
TryToSimplifyUnconditionalBranchFromEmptyBlock() in
Transforms/Utils/Local.cpp. Now "for.body.4" has three incoming edges (two
backedges).

; Function Attrs: nounwind
define void @foo2(float* noalias nocapture %x, float* noalias nocapture
readonly %y, float* noalias nocapture readonly %z) #0 {
entry:
  %i = alloca i32, align 4
  tail call void @llvm.dbg.value(metadata float* %x, i64 0, metadata !11,
metadata !25), !dbg !26
  tail call void @llvm.dbg.value(metadata float* %y, i64 0, metadata !12,
metadata !25), !dbg !27
  tail call void @llvm.dbg.value(metadata float* %z, i64 0, metadata !13,
metadata !25), !dbg !28
  %i.0.i.0..sroa_cast = bitcast i32* %i to i8*
  call void @llvm.lifetime.start(i64 4, i8* %i.0.i.0..sroa_cast)
  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !14,
metadata !25), !dbg !29
  store volatile i32 0, i32* %i, align 4, !dbg !29
  tail call void @llvm.dbg.value(metadata i32* %i, i64 0, metadata !14,
metadata !25), !dbg !29
  %i.0.i.0..21 = load volatile i32, i32* %i, align 4, !dbg !30
  %cmp.22 = icmp slt i32 %i.0.i.0..21, 1000, !dbg !33
  br i1 %cmp.22, label %for.body.4, label %for.cond.cleanup, !dbg !34

for.cond.cleanup:                                 ; preds =
%for.cond.cleanup.3, %entry
  call void @llvm.lifetime.end(i64 4, i8* %i.0.i.0..sroa_cast)
  ret void, !dbg !35

for.cond.cleanup.3:                               ; preds = %for.body.4
  tail call void @llvm.dbg.value(metadata i32* %i, i64 0, metadata !14,
metadata !25), !dbg !29
  %i.0.i.0.17 = load volatile i32, i32* %i, align 4, !dbg !36
  %inc10 = add nsw i32 %i.0.i.0.17, 1, !dbg !36
  tail call void @llvm.dbg.value(metadata i32 %inc10, i64 0, metadata !14,
metadata !25), !dbg !29
  store volatile i32 %inc10, i32* %i, align 4, !dbg !36
  tail call void @llvm.dbg.value(metadata i32* %i, i64 0, metadata !14,
metadata !25), !dbg !29
  %i.0.i.0. = load volatile i32, i32* %i, align 4, !dbg !30
  %cmp = icmp slt i32 %i.0.i.0., 1000, !dbg !33
  br i1 %cmp, label %for.body.4, label %for.cond.cleanup, !dbg !34

for.body.4:                                       ; preds =
%for.cond.cleanup.3, %entry, %for.body.4
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body.4 ], [ 0, %entry ],
[ 0, %for.cond.cleanup.3 ]
  %arrayidx = getelementptr inbounds float, float* %y, i64
%indvars.iv, !dbg !37
  %0 = load float, float* %arrayidx, align 4, !dbg !37, !tbaa !40
  %arrayidx6 = getelementptr inbounds float, float* %z, i64
%indvars.iv, !dbg !44
  %1 = load float, float* %arrayidx6, align 4, !dbg !44, !tbaa !40
  %add = fadd float %0, %1, !dbg !45
  %arrayidx8 = getelementptr inbounds float, float* %x, i64
%indvars.iv, !dbg !46
  store float %add, float* %arrayidx8, align 4, !dbg !47, !tbaa !40
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !48
  %exitcond = icmp eq i64 %indvars.iv.next, 1600, !dbg !48
  br i1 %exitcond, label %for.cond.cleanup.3, label %for.body.4, !dbg !48

After another loop simplify: Loop simplify tries to separate out nested
loops but fails to do so with this loop since it does not has a PHI node
for the outer loop variable. Instead, it creates a backedge block.

for.cond.cleanup.3:                               ; preds = %for.body.4
  tail call void @llvm.dbg.value(metadata i32* %i, i64 0, metadata !14,
metadata !25), !dbg !29
  %i.0.i.0.17 = load volatile i32, i32* %i, align 4, !dbg !39
  %inc10 = add nsw i32 %i.0.i.0.17, 1, !dbg !39
  tail call void @llvm.dbg.value(metadata i32 %inc10, i64 0, metadata !14,
metadata !25), !dbg !29
  store volatile i32 %inc10, i32* %i, align 4, !dbg !39
  tail call void @llvm.dbg.value(metadata i32* %i, i64 0, metadata !14,
metadata !25), !dbg !29
  %i.0.i.0. = load volatile i32, i32* %i, align 4, !dbg !30
  %cmp = icmp slt i32 %i.0.i.0., 1000, !dbg !33
  br i1 %cmp, label %for.body.4.backedge, label
%for.cond.cleanup.loopexit, !dbg !34

for.body.4:                                       ; preds =
%for.body.4.backedge, %for.body.4.preheader
  %indvars.iv = phi i64 [ 0, %for.body.4.preheader ], [ %indvars.iv.be,
%for.body.4.backedge ]
  %arrayidx = getelementptr inbounds float, float* %y, i64
%indvars.iv, !dbg !35
  %0 = load float, float* %arrayidx, align 4, !dbg !35, !tbaa !40
  %arrayidx6 = getelementptr inbounds float, float* %z, i64
%indvars.iv, !dbg !44
  %1 = load float, float* %arrayidx6, align 4, !dbg !44, !tbaa !40
  %add = fadd float %0, %1, !dbg !45
  %arrayidx8 = getelementptr inbounds float, float* %x, i64
%indvars.iv, !dbg !46
  store float %add, float* %arrayidx8, align 4, !dbg !47, !tbaa !40
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !48
  %exitcond = icmp eq i64 %indvars.iv.next, 1600, !dbg !48
  br i1 %exitcond, label %for.cond.cleanup.3, label
%for.body.4.backedge, !dbg !48

for.body.4.backedge:                              ; preds = %for.body.4,
%for.cond.cleanup.3
  %indvars.iv.be = phi i64 [ %indvars.iv.next, %for.body.4 ], [ 0,
%for.cond.cleanup.3 ]
  br label %for.body.4x

LLVM  loop vectorizer rejects to vectorize any loop for which a loop latch
(for.body.4.backedge) is different from a loop exiting block
(for.cond.cleanup.3). The loop vectorizer can assume that all instructions
in the loop are executed the same number of times with the test.

I believe a fix is in order in one way or another because the example is
simple and common enough and vectorized by other compilers. We may approach
it by either (1) preventing loops from being collapsed in the first place
or (2) teaching loop vectorizer to handle collapsed loops. For (2), we may
need to allow loop vectorizer to forego the assumption and handle the loop
as it is. The assumption seems fundamental to many of the vectorization
algorithms, so it will require extensive updates or may end up with
reverting the loop back to a properly nested form. The downside of (1) is
that it may slow down common optimization passes that are repeatedly
executed before vectorization.

My patch (http://reviews.llvm.org/D11728) is a prototype fix for (1) that
modifies Jump Threading and Simplify-the-CFG to not eliminate an empty loop
header BB even when the loop does not have a PHI node for its induction
variable. The details can be found at http://reviews.llvm.org/D11728. I
would welcome and appreciate any comments or feedback.

Best,
Hyojin

From:	Hal Finkel <hfinkel at anl.gov>
To:	Chandler Carruth <chandlerc at google.com>
Cc:	llvmdev at cs.uiuc.edu
Date:	07/16/2015 03:19 AM
Subject:	Re: [LLVMdev] Improving loop vectorizer support for loops with
            a volatile iteration variable
Sent by:	llvmdev-bounces at cs.uiuc.edu

 From: "Hal Finkel" <hfinkel at anl.gov>
 To: "Chandler Carruth" <chandlerc at google.com>
 Cc: llvmdev at cs.uiuc.edu
 Sent: Thursday, July 16, 2015 1:58:02 AM
 Subject: Re: [LLVMdev] Improving loop vectorizer support for loops with a
 volatile iteration variable

 From: "Hal Finkel" <hfinkel at anl.gov>
 To: "Chandler Carruth" <chandlerc at google.com>
 Cc: llvmdev at cs.uiuc.edu
 Sent: Thursday, July 16, 2015 1:46:42 AM
 Subject: Re: [LLVMdev] Improving loop vectorizer support for loops with a
 volatile iteration variable

  From: "Chandler Carruth" <chandlerc at google.com>
  To: "Hal Finkel" <hfinkel at anl.gov>
  Cc: "Hyojin Sung" <hsung at us.ibm.com>, llvmdev at cs.uiuc.edu
  Sent: Thursday, July 16, 2015 1:06:03 AM
  Subject: Re: [LLVMdev] Improving loop vectorizer support for loops with a
  volatile iteration variable

  On Wed, Jul 15, 2015 at 6:36 PM Hal Finkel <hfinkel at anl.gov> wrote:

    From: "Chandler Carruth" <chandlerc at google.com>
    To: "Hyojin Sung" <hsung at us.ibm.com>, llvmdev at cs.uiuc.edu
    Sent: Wednesday, July 15, 2015 7:34:54 PM
    Subject: Re: [LLVMdev] Improving loop vectorizer support for loops with
    a volatile iteration variable

    On Wed, Jul 15, 2015 at 12:55 PM Hyojin Sung <hsung at us.ibm.com> wrote:
      Hi all,

      I would like to propose an improvement of the “almost dead” block
      elimination in Transforms/Local.cpp so that it will preserve the
      canonical loop form for loops with a volatile iteration variable.

      *** Problem statement
      Nested loops in LCALS Subset B (https://codesign.llnl.gov/LCALS.php)
      are not vectorized with LLVM -O3 because the LLVM loop vectorizer
      fails the test whether the loop latch and exiting block of a loop is
      the same. The loops are vectorizable, and get vectorized with LLVM
      -O2

    I would be interested to know why -O2 succeeds here.

      and also with other commercial compilers (icc, xlc).

      *** Details
      These loops ended up with different loop latch and exiting block
      after a series of optimizations including loop unswitching, jump
      threading, simplify-the-CFG, and loop simplify. The fundamental
      problem here is that the above optimizations cannot recognize a loop
      with a volatile iteration variable and do not preserve its canonical
      loop structure.

    Ok, meta-level question first:

    Why do we care about performance of loops with a volatile iteration
    variable?
   I don't think we do, however, I think that misses the point. In this
   case, the volatile iteration variable is just an easy way to expose this
   problem that we have with nested loop canonicalization and the
   vectorizer. To be specific:

   This we vectorizer just fine:

   void foo1(float * restrict x, float * restrict y, float * restrict z) {
     for (int i = 0; i < 1000; ++i) {
       for (int j = 0; j < 1600; ++j) {
         x[j] = y[j] + z[j];
       }
     }
   }

   And, indeed, this we don't (the only change is adding volatile on i):

   void foo2(float * restrict x, float * restrict y, float * restrict z) {
     for (volatile int i = 0; i < 1000; ++i) {
       for (int j = 0; j < 1600; ++j) {
         x[j] = y[j] + z[j];
       }
     }
   }

   However, this we don't either, and that's a big problem:

   int done(float *x, float *y, float *z);
   void foo3(float * restrict x, float * restrict y, float * restrict z) {
     while (!done(x, y, z)) {
       for (int j = 0; j < 1600; ++j) {
         x[j] = y[j] + z[j];
       }
     }
   }

   And the underlying reason is the same. The IR at the point in time when
   the loop vectorizer runs looks like this:

   define void @foo3(float* noalias %x, float* noalias %y, float* noalias
   %z) #0 {
   entry:
     %call.14 = tail call i32 @done(float* %x, float* %y, float* %z) #1
     %lnot.15 = icmp eq i32 %call.14, 0
     br i1 %lnot.15, label %for.body.preheader, label %while.end

   for.body.preheader:                               ; preds = %entry
     br label %for.body

   while.cond.loopexit:                              ; preds = %for.body
     %call = tail call i32 @done(float* %x, float* %y, float* %z) #1
     %lnot = icmp eq i32 %call, 0
     br i1 %lnot, label %for.body.backedge, label %while.end.loopexit

   for.body:                                         ; preds =
   %for.body.backedge, %for.body.preheader
     %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.be,
   %for.body.backedge ]
     ...
     %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
     %exitcond = icmp eq i64 %indvars.iv.next, 1600
     br i1 %exitcond, label %while.cond.loopexit, label %for.body.backedge

   for.body.backedge:                                ; preds = %for.body,
   %while.cond.loopexit
     %indvars.iv.be = phi i64 [ %indvars.iv.next, %for.body ], [ 0,
   %while.cond.loopexit ]
     br label %for.body

   while.end.loopexit:                               ; preds =
   %while.cond.loopexit
     br label %while.end

   while.end:                                        ; preds =
   %while.end.loopexit, %entry
     ret void
   }

   and we can currently only vectorize loops where the loop latch is also
   the loop's exiting block. In this case, as in the case with the volatile
   variable, vectorization is blocked by this constraint (here the backedge
   is from the terminator of %for.body.backedge, but the loop exiting block
   is %for.body). The check in the vectorizer is explicit:

     // We only handle bottom-tested loops, i.e. loop in which the
   condition is
     // checked at the end of each iteration. With that we can assume that
   all
     // instructions in the loop are executed the same number of times.
     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
       ...

  Thanks for the detailed explanation. This makes much more sense why we
  need to handle it. I think its much better to look at nested loops of
  this form than anything to do with volatile -- the latter is too prone to
  other random optimizations turning off.

  Regarding this problem, it would be interesting to know based on this
  explanation what the desired fix would be. I see at least these options:

  1) Canonicalize loops harder to make them look the way the vectorizer
  wants. If this can be done without causing significant problems, it seems
  likely the best approach.
 I agree. In this case, we could certainly fold the trivial
 %for.body.backedge block into %for.body, meaning transforming this:

 for.body.backedge:                                ; preds =
 %while.cond.loopexit, %for.body
   %indvars.iv.be = phi i64 [ %indvars.iv.next, %for.body ], [ 0,
 %while.cond.loopexit ]
   br label %for.body

 for.body:                                         ; preds =
 %for.body.backedge, %for.body.preheader
   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.be,
 %for.body.backedge ]
   ...
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 1600
   br i1 %exitcond, label %while.cond.loopexit, label %for.body.backedge

 into this:

 for.body:                                         ; preds =
 %for.body.backedge, %for.body.preheader
   %indvars.iv.be = phi i64 [ %indvars.iv.next, %for.body ], [ 0,
 %while.cond.loopexit ]
   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.be,
 %for.body ]
   ...
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 1600
   br i1 %exitcond, label %while.cond.loopexit, label %for.body

 and this seems pretty trivial when %for.body.backedge is completely empty
 (as in this case), but if it had non-PHI instructions in it on which the
 existing PHIs in %for.body depended, then maybe this becomes less trivial?
 Although, based on what I said below, the case with instructions there we
 can't currently vectorize anyway for more-fundamental reasons.

  -Hal

Also worth pointing out that SimplifyCFG does this exact transformation.
The vectorizer never sees it, however, because LoopSimplify prefers this
form with the separate backedge block that the vectorizer can't handle.

 -Hal

  2) Teach the vectorizer to vectorize without this constraint by instead
  establishing the actual invariant it cares about.
 It really cares that there's no code that comes in between the latch and
 the exit, because such code is not really part of the loop (it only runs
 once), or code in between the exit and the latch (because such code runs
 in one fewer iterations than the code before the exit). At least nothing
 with side effects I presume.

  -Hal

  Maybe there is another strategy?

    -Hal
    That seems both counter-intuitive and unlikely to be a useful goal. We
    simply don't optimize volatile operations well in *any* part of the
    optimizer, and I'm not sure why we need to start trying to fix that.
    This seems like an irreparably broken benchmark, but perhaps there is a
    motivation I don't yet see.

    Assuming that sufficient motivation arises to try to fix this, see my
    comments below:

      (1) Loop unswitching generates several empty placeholder BBs only
      with PHI nodes after separating out a shorter path with no inner loop
      execution from a standard path.

      (2) Jump threading and simplify-the-CFG passes independently calls
      TryToSimplifyUnconditionalBranchFromEmptyBlock() in
      Transforms/Utils/Local.cpp to get rid of almost empty BBs.

      (3) TryToSimplifyUnconditionalBranchFromEmtpyBlock() eliminates the
      placeholder BBs after loop unswitching and merges them into
      subsequent blocks including the header of the inner loop. Before
      eliminating the blocks, the function checks if the block is a loop
      header by looking at its PHI nodes so that it can be saved, but the
      test fails with the loops with a volatile iteration variable.

    Why does this fail for a volatile iteration variable but not for a
    non-volatile one? I think understanding that will be key to
    understanding how it should be fixed.

    _______________________________________________
    LLVM Developers mailing list
    LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
    http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev

   --
   Hal Finkel
   Assistant Computational Scientist
   Leadership Computing Facility
   Argonne National Laboratory

 --
 Hal Finkel
 Assistant Computational Scientist
 Leadership Computing Facility
 Argonne National Laboratory

 _______________________________________________
 LLVM Developers mailing list
 LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
 http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev

 --
 Hal Finkel
 Assistant Computational Scientist
 Leadership Computing Facility
 Argonne National Laboratory

 _______________________________________________
 LLVM Developers mailing list
 LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
 http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev

--
Hal Finkel
Assistant Computational Scientist
Leadership Computing Facility
Argonne National Laboratory_______________________________________________
LLVM Developers mailing list
LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20150803/b069d698/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: graycol.gif
Type: image/gif
Size: 105 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20150803/b069d698/attachment.gif>