[llvm] LAA: pre-commit tests for stride-versioning (PR #97570)

Tue Aug 20 03:24:33 PDT 2024

https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/97570

>From 86c6ad69ad21332a315f624052abc716d547a214 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Wed, 3 Jul 2024 13:48:48 +0100
Subject: [PATCH 1/3] LAA: add pre-commit tests for stride-versioning

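Add tests for when the Stride is unknown and equal to the trip count
(TC), both directly and with the TC derived through different kinds of
casts (zext, sext, trunc). In these cases, LAA should not speculate
that Stride == 1, since that would imply that the loop iterates no more
than once.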
---
 .../LoopAccessAnalysis/symbolic-stride.ll     | 181 ++++++++++++++++++
 1 file changed, 181 insertions(+)

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
index 7c1b11e22aef24..b14eb6d6aa3dbf 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
@@ -310,3 +310,184 @@ loop:
 exit:                                          ; preds = %loop
   ret void
 }
+
+; Check the scenario where we have an unknown Stride, which happens to also be
+; the loop iteration count. If we speculate Stride==1, it implies that the loop
+; will iterate no more than a single iteration.
+define i32 @unknown_stride_equalto_tc(i32 %N, ptr %A, ptr %B, i32 %i, i32 %j)  {
+; CHECK-LABEL: 'unknown_stride_equalto_tc'
+; CHECK-NEXT:    for.body:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %tmp.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add1, %for.body ]
+  %k.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = mul i32 %k.09, %N
+  %add = add i32 %mul, %j
+  %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv = sext i16 %0 to i32
+  %add1 = add nsw i32 %tmp.010, %conv
+  %inc = add nuw i32 %k.09, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  %add1.lcssa = phi i32 [ %add1, %for.body ]
+  br label %for.end
+
+for.end:
+  %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add1.lcssa, %for.end.loopexit ]
+  ret i32 %tmp.0.lcssa
+}
+
+
+; Check the scenario where we have an unknown Stride, which happens to also be
+; the loop iteration count, but the TC is zero-extended from a narrower type.
+define i32 @unknown_stride_equalto_zext_tc(i16 zeroext %N, ptr %A, ptr %B, i32 %i, i32 %j) {
+; CHECK-LABEL: 'unknown_stride_equalto_zext_tc'
+; CHECK-NEXT:    for.body:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %conv = zext i16 %N to i32
+  %cmp11 = icmp eq i16 %N, 0
+  br i1 %cmp11, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %tmp.013 = phi i32 [ 0, %for.body.lr.ph ], [ %add4, %for.body ]
+  %k.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = mul nuw i32 %k.012, %conv
+  %add = add i32 %mul, %j
+  %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv3 = sext i16 %0 to i32
+  %add4 = add nsw i32 %tmp.013, %conv3
+  %inc = add nuw nsw i32 %k.012, 1
+  %exitcond = icmp eq i32 %inc, %conv
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  %add4.lcssa = phi i32 [ %add4, %for.body ]
+  br label %for.end
+
+for.end:
+  %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add4.lcssa, %for.end.loopexit ]
+  ret i32 %tmp.0.lcssa
+}
+
+; Check the scenario where we have an unknown Stride, which happens to also be
+; the loop iteration count, but the TC is sign-extended from a narrower type.
+define i32 @unknown_stride_equalto_sext_tc(i16 %N, ptr %A, ptr %B, i32 %i, i32 %j) {
+; CHECK-LABEL: 'unknown_stride_equalto_sext_tc'
+; CHECK-NEXT:    for.body:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %conv = sext i16 %N to i32
+  %cmp11 = icmp eq i16 %N, 0
+  br i1 %cmp11, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %tmp.013 = phi i32 [ 0, %for.body.lr.ph ], [ %add4, %for.body ]
+  %k.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = mul nuw i32 %k.012, %conv
+  %add = add i32 %mul, %j
+  %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv3 = sext i16 %0 to i32
+  %add4 = add nsw i32 %tmp.013, %conv3
+  %inc = add nuw nsw i32 %k.012, 1
+  %exitcond = icmp eq i32 %inc, %conv
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  %add4.lcssa = phi i32 [ %add4, %for.body ]
+  br label %for.end
+
+for.end:
+  %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add4.lcssa, %for.end.loopexit ]
+  ret i32 %tmp.0.lcssa
+}
+
+; Check the scenario where we have an unknown Stride, which happens to also be
+; the loop iteration count, but the TC is truncated from a wider type.
+define i32 @unknown_stride_equalto_trunc_tc(i64 %N, ptr %A, ptr %B, i32 %i, i32 %j) {
+; CHECK-LABEL: 'unknown_stride_equalto_trunc_tc'
+; CHECK-NEXT:    for.body:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %conv = trunc i64 %N to i32
+  %cmp11 = icmp eq i64 %N, 0
+  br i1 %cmp11, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %tmp.013 = phi i32 [ 0, %for.body.lr.ph ], [ %add4, %for.body ]
+  %k.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = mul nuw i32 %k.012, %conv
+  %add = add i32 %mul, %j
+  %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv3 = sext i16 %0 to i32
+  %add4 = add nsw i32 %tmp.013, %conv3
+  %inc = add nuw nsw i32 %k.012, 1
+  %exitcond = icmp eq i32 %inc, %conv
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  %add4.lcssa = phi i32 [ %add4, %for.body ]
+  br label %for.end
+
+for.end:
+  %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add4.lcssa, %for.end.loopexit ]
+  ret i32 %tmp.0.lcssa
+}

>From e2eb85b348dcb504e33cf0f0e0d1d89214b15619 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Mon, 15 Jul 2024 16:55:02 +0100
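Subject: [PATCH 2/3] LAA/symbolic-stride: clean up test

Drop the unused %i argument, remove the separate preheader and
loop-exit blocks, and rename blocks and values (for.body -> loop,
%k -> %iv) in the unknown_stride_equalto_* tests. The only change to
the CHECK lines is the loop label.
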
---
 .../LoopAccessAnalysis/symbolic-stride.ll     | 156 +++++++-----------
 1 file changed, 64 insertions(+), 92 deletions(-)

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
index b14eb6d6aa3dbf..834251cf23c58d 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
@@ -314,9 +314,9 @@ exit:                                          ; preds = %loop
 ; Check the scenario where we have an unknown Stride, which happens to also be
 ; the loop iteration count. If we speculate Stride==1, it implies that the loop
 ; will iterate no more than a single iteration.
-define i32 @unknown_stride_equalto_tc(i32 %N, ptr %A, ptr %B, i32 %i, i32 %j)  {
+define i32 @unknown_stride_equalto_tc(i32 %N, ptr %A, ptr %B, i32 %j)  {
 ; CHECK-LABEL: 'unknown_stride_equalto_tc'
-; CHECK-NEXT:    for.body:
+; CHECK-NEXT:    loop:
 ; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
@@ -328,40 +328,33 @@ define i32 @unknown_stride_equalto_tc(i32 %N, ptr %A, ptr %B, i32 %i, i32 %j)  {
 ; CHECK-NEXT:      Expressions re-written:
 ;
 entry:
-  %cmp8 = icmp eq i32 %N, 0
-  br i1 %cmp8, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:
-  br label %for.body
+  %cmp = icmp eq i32 %N, 0
+  br i1 %cmp, label %exit, label %loop
 
-for.body:
-  %tmp.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add1, %for.body ]
-  %k.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %mul = mul i32 %k.09, %N
+loop:
+  %add1 = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %mul = mul i32 %iv, %N
   %add = add i32 %mul, %j
   %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
   %0 = load i16, ptr %arrayidx, align 2
   %conv = sext i16 %0 to i32
-  %add1 = add nsw i32 %tmp.010, %conv
-  %inc = add nuw i32 %k.09, 1
-  %exitcond = icmp eq i32 %inc, %N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:
-  %add1.lcssa = phi i32 [ %add1, %for.body ]
-  br label %for.end
-
-for.end:
-  %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add1.lcssa, %for.end.loopexit ]
-  ret i32 %tmp.0.lcssa
+  %add1.next = add nsw i32 %add1, %conv
+  %iv.next = add nuw i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %N
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  %ret = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
+  ret i32 %ret
 }
 
 
 ; Check the scenario where we have an unknown Stride, which happens to also be
 ; the loop iteration count, but the TC is zero-extended from a narrower type.
-define i32 @unknown_stride_equalto_zext_tc(i16 zeroext %N, ptr %A, ptr %B, i32 %i, i32 %j) {
+define i32 @unknown_stride_equalto_zext_tc(i16 zeroext %N, ptr %A, ptr %B, i32 %j) {
 ; CHECK-LABEL: 'unknown_stride_equalto_zext_tc'
-; CHECK-NEXT:    for.body:
+; CHECK-NEXT:    loop:
 ; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
@@ -374,39 +367,32 @@ define i32 @unknown_stride_equalto_zext_tc(i16 zeroext %N, ptr %A, ptr %B, i32 %
 ;
 entry:
   %conv = zext i16 %N to i32
-  %cmp11 = icmp eq i16 %N, 0
-  br i1 %cmp11, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:
-  br label %for.body
+  %cmp = icmp eq i16 %N, 0
+  br i1 %cmp, label %exit, label %loop
 
-for.body:
-  %tmp.013 = phi i32 [ 0, %for.body.lr.ph ], [ %add4, %for.body ]
-  %k.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %mul = mul nuw i32 %k.012, %conv
+loop:
+  %add1 = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %mul = mul nuw i32 %iv, %conv
   %add = add i32 %mul, %j
   %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
   %0 = load i16, ptr %arrayidx, align 2
   %conv3 = sext i16 %0 to i32
-  %add4 = add nsw i32 %tmp.013, %conv3
-  %inc = add nuw nsw i32 %k.012, 1
-  %exitcond = icmp eq i32 %inc, %conv
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:
-  %add4.lcssa = phi i32 [ %add4, %for.body ]
-  br label %for.end
-
-for.end:
-  %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add4.lcssa, %for.end.loopexit ]
-  ret i32 %tmp.0.lcssa
+  %add1.next = add nsw i32 %add1, %conv3
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %conv
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  %ret = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
+  ret i32 %ret
 }
 
 ; Check the scenario where we have an unknown Stride, which happens to also be
 ; the loop iteration count, but the TC is sign-extended from a narrower type.
-define i32 @unknown_stride_equalto_sext_tc(i16 %N, ptr %A, ptr %B, i32 %i, i32 %j) {
+define i32 @unknown_stride_equalto_sext_tc(i16 %N, ptr %A, ptr %B, i32 %j) {
 ; CHECK-LABEL: 'unknown_stride_equalto_sext_tc'
-; CHECK-NEXT:    for.body:
+; CHECK-NEXT:    loop:
 ; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
@@ -419,39 +405,32 @@ define i32 @unknown_stride_equalto_sext_tc(i16 %N, ptr %A, ptr %B, i32 %i, i32 %
 ;
 entry:
   %conv = sext i16 %N to i32
-  %cmp11 = icmp eq i16 %N, 0
-  br i1 %cmp11, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:
-  br label %for.body
+  %cmp = icmp eq i16 %N, 0
+  br i1 %cmp, label %exit, label %loop
 
-for.body:
-  %tmp.013 = phi i32 [ 0, %for.body.lr.ph ], [ %add4, %for.body ]
-  %k.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %mul = mul nuw i32 %k.012, %conv
+loop:
+  %add1 = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %mul = mul nuw i32 %iv, %conv
   %add = add i32 %mul, %j
   %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
   %0 = load i16, ptr %arrayidx, align 2
   %conv3 = sext i16 %0 to i32
-  %add4 = add nsw i32 %tmp.013, %conv3
-  %inc = add nuw nsw i32 %k.012, 1
-  %exitcond = icmp eq i32 %inc, %conv
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:
-  %add4.lcssa = phi i32 [ %add4, %for.body ]
-  br label %for.end
-
-for.end:
-  %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add4.lcssa, %for.end.loopexit ]
-  ret i32 %tmp.0.lcssa
+  %add1.next = add nsw i32 %add1, %conv3
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %conv
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  %ret = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
+  ret i32 %ret
 }
 
 ; Check the scenario where we have an unknown Stride, which happens to also be
 ; the loop iteration count, but the TC is truncated from a wider type.
-define i32 @unknown_stride_equalto_trunc_tc(i64 %N, ptr %A, ptr %B, i32 %i, i32 %j) {
+define i32 @unknown_stride_equalto_trunc_tc(i64 %N, ptr %A, ptr %B, i32 %j) {
 ; CHECK-LABEL: 'unknown_stride_equalto_trunc_tc'
-; CHECK-NEXT:    for.body:
+; CHECK-NEXT:    loop:
 ; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
@@ -464,30 +443,23 @@ define i32 @unknown_stride_equalto_trunc_tc(i64 %N, ptr %A, ptr %B, i32 %i, i32
 ;
 entry:
   %conv = trunc i64 %N to i32
-  %cmp11 = icmp eq i64 %N, 0
-  br i1 %cmp11, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:
-  br label %for.body
+  %cmp = icmp eq i64 %N, 0
+  br i1 %cmp, label %exit, label %loop
 
-for.body:
-  %tmp.013 = phi i32 [ 0, %for.body.lr.ph ], [ %add4, %for.body ]
-  %k.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %mul = mul nuw i32 %k.012, %conv
+loop:
+  %add1 = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %mul = mul nuw i32 %iv, %conv
   %add = add i32 %mul, %j
   %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
   %0 = load i16, ptr %arrayidx, align 2
   %conv3 = sext i16 %0 to i32
-  %add4 = add nsw i32 %tmp.013, %conv3
-  %inc = add nuw nsw i32 %k.012, 1
-  %exitcond = icmp eq i32 %inc, %conv
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:
-  %add4.lcssa = phi i32 [ %add4, %for.body ]
-  br label %for.end
-
-for.end:
-  %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add4.lcssa, %for.end.loopexit ]
-  ret i32 %tmp.0.lcssa
+  %add1.next = add nsw i32 %add1, %conv3
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %conv
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  %ret = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
+  ret i32 %ret
 }

>From 70874e1b39f4512d1a474c55b757a8f404335f0b Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Tue, 20 Aug 2024 11:22:37 +0100
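Subject: [PATCH 3/3] LAA/symbolic-stride: make tests meaningful

Store the loaded value to %A instead of accumulating it into a returned
reduction, so that each loop contains both a strided load from %B and a
store. The CHECK lines now show the run-time memory checks between the
two pointer groups and the <nssw> SCEV predicate added on the strided
access, rather than an empty report.
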
---
 .../LoopAccessAnalysis/symbolic-stride.ll     | 134 ++++++++++++------
 1 file changed, 93 insertions(+), 41 deletions(-)

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
index 834251cf23c58d..a3861d6663032c 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
@@ -314,152 +314,204 @@ exit:                                          ; preds = %loop
 ; Check the scenario where we have an unknown Stride, which happens to also be
 ; the loop iteration count. If we speculate Stride==1, it implies that the loop
 ; will iterate no more than a single iteration.
-define i32 @unknown_stride_equalto_tc(i32 %N, ptr %A, ptr %B, i32 %j)  {
+define void @unknown_stride_equalto_tc(i32 %N, ptr %A, ptr %B, i32 %j)  {
 ; CHECK-LABEL: 'unknown_stride_equalto_tc'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP5:0x[0-9a-f]+]]):
+; CHECK-NEXT:        ptr %A
+; CHECK-NEXT:        Against group ([[GRP6:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
 ; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP5]]:
+; CHECK-NEXT:          (Low: %A High: (4 + %A))
+; CHECK-NEXT:            Member: %A
+; CHECK-NEXT:        Group [[GRP6]]:
+; CHECK-NEXT:          (Low: (((2 * (sext i32 %j to i64))<nsw> + %B) umin ((2 * (sext i32 %j to i64))<nsw> + (2 * (zext i32 (-1 + %N) to i64) * (sext i32 %N to i64)) + %B)) High: (2 + (((2 * (sext i32 %j to i64))<nsw> + %B) umax ((2 * (sext i32 %j to i64))<nsw> + (2 * (zext i32 (-1 + %N) to i64) * (sext i32 %N to i64)) + %B))))
+; CHECK-NEXT:            Member: {((2 * (sext i32 %j to i64))<nsw> + %B),+,(2 * (sext i32 %N to i64))<nsw>}<%loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:
+; CHECK-NEXT:      {%j,+,%N}<%loop> Added Flags: <nssw>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Expressions re-written:
+; CHECK-NEXT:      [PSE] %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add:
+; CHECK-NEXT:        ((2 * (sext i32 {%j,+,%N}<%loop> to i64))<nsw> + %B)
+; CHECK-NEXT:        --> {((2 * (sext i32 %j to i64))<nsw> + %B),+,(2 * (sext i32 %N to i64))<nsw>}<%loop>
 ;
 entry:
   %cmp = icmp eq i32 %N, 0
   br i1 %cmp, label %exit, label %loop
 
 loop:
-  %add1 = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
   %mul = mul i32 %iv, %N
   %add = add i32 %mul, %j
   %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
-  %0 = load i16, ptr %arrayidx, align 2
-  %conv = sext i16 %0 to i32
-  %add1.next = add nsw i32 %add1, %conv
+  %load = load i16, ptr %arrayidx
+  %sext = sext i16 %load to i32
+  store i32 %sext, ptr %A
   %iv.next = add nuw i32 %iv, 1
   %exitcond = icmp eq i32 %iv.next, %N
   br i1 %exitcond, label %exit, label %loop
 
 exit:
-  %ret = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
-  ret i32 %ret
+  ret void
 }
 
 
 ; Check the scenario where we have an unknown Stride, which happens to also be
 ; the loop iteration count, but the TC is zero-extended from a narrower type.
-define i32 @unknown_stride_equalto_zext_tc(i16 zeroext %N, ptr %A, ptr %B, i32 %j) {
+define void @unknown_stride_equalto_zext_tc(i16 zeroext %N, ptr %A, ptr %B, i32 %j) {
 ; CHECK-LABEL: 'unknown_stride_equalto_zext_tc'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP7:0x[0-9a-f]+]]):
+; CHECK-NEXT:        ptr %A
+; CHECK-NEXT:        Against group ([[GRP8:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
 ; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP7]]:
+; CHECK-NEXT:          (Low: %A High: (4 + %A))
+; CHECK-NEXT:            Member: %A
+; CHECK-NEXT:        Group [[GRP8]]:
+; CHECK-NEXT:          (Low: (((2 * (sext i32 %j to i64))<nsw> + %B) umin ((2 * (sext i32 %j to i64))<nsw> + (2 * (zext i32 (-1 + (zext i16 %N to i32))<nsw> to i64) * (zext i16 %N to i64)) + %B)) High: (2 + (((2 * (sext i32 %j to i64))<nsw> + %B) umax ((2 * (sext i32 %j to i64))<nsw> + (2 * (zext i32 (-1 + (zext i16 %N to i32))<nsw> to i64) * (zext i16 %N to i64)) + %B))))
+; CHECK-NEXT:            Member: {((2 * (sext i32 %j to i64))<nsw> + %B),+,(2 * (zext i16 %N to i64))<nuw><nsw>}<%loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:
+; CHECK-NEXT:      {%j,+,(zext i16 %N to i32)}<nw><%loop> Added Flags: <nssw>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Expressions re-written:
+; CHECK-NEXT:      [PSE] %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add:
+; CHECK-NEXT:        ((2 * (sext i32 {%j,+,(zext i16 %N to i32)}<nw><%loop> to i64))<nsw> + %B)
+; CHECK-NEXT:        --> {((2 * (sext i32 %j to i64))<nsw> + %B),+,(2 * (zext i16 %N to i64))<nuw><nsw>}<%loop>
 ;
 entry:
-  %conv = zext i16 %N to i32
+  %N.ext = zext i16 %N to i32
   %cmp = icmp eq i16 %N, 0
   br i1 %cmp, label %exit, label %loop
 
 loop:
-  %add1 = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
-  %mul = mul nuw i32 %iv, %conv
+  %mul = mul nuw i32 %iv, %N.ext
   %add = add i32 %mul, %j
   %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
-  %0 = load i16, ptr %arrayidx, align 2
-  %conv3 = sext i16 %0 to i32
-  %add1.next = add nsw i32 %add1, %conv3
+  %load = load i16, ptr %arrayidx
+  %sext = sext i16 %load to i32
+  store i32 %sext, ptr %A
   %iv.next = add nuw nsw i32 %iv, 1
-  %exitcond = icmp eq i32 %iv.next, %conv
+  %exitcond = icmp eq i32 %iv.next, %N.ext
   br i1 %exitcond, label %exit, label %loop
 
 exit:
-  %ret = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
-  ret i32 %ret
+  ret void
 }
 
 ; Check the scenario where we have an unknown Stride, which happens to also be
 ; the loop iteration count, but the TC is sign-extended from a narrower type.
-define i32 @unknown_stride_equalto_sext_tc(i16 %N, ptr %A, ptr %B, i32 %j) {
+define void @unknown_stride_equalto_sext_tc(i16 %N, ptr %A, ptr %B, i32 %j) {
 ; CHECK-LABEL: 'unknown_stride_equalto_sext_tc'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP9:0x[0-9a-f]+]]):
+; CHECK-NEXT:        ptr %A
+; CHECK-NEXT:        Against group ([[GRP10:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
 ; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP9]]:
+; CHECK-NEXT:          (Low: %A High: (4 + %A))
+; CHECK-NEXT:            Member: %A
+; CHECK-NEXT:        Group [[GRP10]]:
+; CHECK-NEXT:          (Low: (((2 * (sext i32 %j to i64))<nsw> + %B) umin ((2 * (sext i32 %j to i64))<nsw> + (2 * (zext i32 (-1 + (sext i16 %N to i32))<nsw> to i64) * (sext i16 %N to i64)) + %B)) High: (2 + (((2 * (sext i32 %j to i64))<nsw> + %B) umax ((2 * (sext i32 %j to i64))<nsw> + (2 * (zext i32 (-1 + (sext i16 %N to i32))<nsw> to i64) * (sext i16 %N to i64)) + %B))))
+; CHECK-NEXT:            Member: {((2 * (sext i32 %j to i64))<nsw> + %B),+,(2 * (sext i16 %N to i64))<nsw>}<%loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:
+; CHECK-NEXT:      {%j,+,(sext i16 %N to i32)}<nw><%loop> Added Flags: <nssw>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Expressions re-written:
+; CHECK-NEXT:      [PSE] %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add:
+; CHECK-NEXT:        ((2 * (sext i32 {%j,+,(sext i16 %N to i32)}<nw><%loop> to i64))<nsw> + %B)
+; CHECK-NEXT:        --> {((2 * (sext i32 %j to i64))<nsw> + %B),+,(2 * (sext i16 %N to i64))<nsw>}<%loop>
 ;
 entry:
-  %conv = sext i16 %N to i32
+  %N.ext = sext i16 %N to i32
   %cmp = icmp eq i16 %N, 0
   br i1 %cmp, label %exit, label %loop
 
 loop:
-  %add1 = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
-  %mul = mul nuw i32 %iv, %conv
+  %mul = mul nuw i32 %iv, %N.ext
   %add = add i32 %mul, %j
   %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
-  %0 = load i16, ptr %arrayidx, align 2
-  %conv3 = sext i16 %0 to i32
-  %add1.next = add nsw i32 %add1, %conv3
+  %load = load i16, ptr %arrayidx
+  %sext = sext i16 %load to i32
+  store i32 %sext, ptr %A
   %iv.next = add nuw nsw i32 %iv, 1
-  %exitcond = icmp eq i32 %iv.next, %conv
+  %exitcond = icmp eq i32 %iv.next, %N.ext
   br i1 %exitcond, label %exit, label %loop
 
 exit:
-  %ret = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
-  ret i32 %ret
+  ret void
 }
 
 ; Check the scenario where we have an unknown Stride, which happens to also be
 ; the loop iteration count, but the TC is truncated from a wider type.
-define i32 @unknown_stride_equalto_trunc_tc(i64 %N, ptr %A, ptr %B, i32 %j) {
+define void @unknown_stride_equalto_trunc_tc(i64 %N, ptr %A, ptr %B, i32 %j) {
 ; CHECK-LABEL: 'unknown_stride_equalto_trunc_tc'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP11:0x[0-9a-f]+]]):
+; CHECK-NEXT:        ptr %A
+; CHECK-NEXT:        Against group ([[GRP12:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
 ; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP11]]:
+; CHECK-NEXT:          (Low: %A High: (4 + %A))
+; CHECK-NEXT:            Member: %A
+; CHECK-NEXT:        Group [[GRP12]]:
+; CHECK-NEXT:          (Low: (((2 * (sext i32 %j to i64))<nsw> + %B) umin ((2 * (sext i32 %j to i64))<nsw> + (2 * (zext i32 (-1 + (trunc i64 %N to i32)) to i64) * (sext i32 (trunc i64 %N to i32) to i64)) + %B)) High: (2 + (((2 * (sext i32 %j to i64))<nsw> + %B) umax ((2 * (sext i32 %j to i64))<nsw> + (2 * (zext i32 (-1 + (trunc i64 %N to i32)) to i64) * (sext i32 (trunc i64 %N to i32) to i64)) + %B))))
+; CHECK-NEXT:            Member: {((2 * (sext i32 %j to i64))<nsw> + %B),+,(2 * (sext i32 (trunc i64 %N to i32) to i64))<nsw>}<%loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:
+; CHECK-NEXT:      {%j,+,(trunc i64 %N to i32)}<nw><%loop> Added Flags: <nssw>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Expressions re-written:
+; CHECK-NEXT:      [PSE] %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add:
+; CHECK-NEXT:        ((2 * (sext i32 {%j,+,(trunc i64 %N to i32)}<nw><%loop> to i64))<nsw> + %B)
+; CHECK-NEXT:        --> {((2 * (sext i32 %j to i64))<nsw> + %B),+,(2 * (sext i32 (trunc i64 %N to i32) to i64))<nsw>}<%loop>
 ;
 entry:
-  %conv = trunc i64 %N to i32
+  %N.trunc = trunc i64 %N to i32
   %cmp = icmp eq i64 %N, 0
   br i1 %cmp, label %exit, label %loop
 
 loop:
-  %add1 = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
-  %mul = mul nuw i32 %iv, %conv
+  %mul = mul nuw i32 %iv, %N.trunc
   %add = add i32 %mul, %j
   %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
-  %0 = load i16, ptr %arrayidx, align 2
-  %conv3 = sext i16 %0 to i32
-  %add1.next = add nsw i32 %add1, %conv3
+  %load = load i16, ptr %arrayidx
+  %sext = sext i16 %load to i32
+  store i32 %sext, ptr %A
   %iv.next = add nuw nsw i32 %iv, 1
-  %exitcond = icmp eq i32 %iv.next, %conv
+  %exitcond = icmp eq i32 %iv.next, %N.trunc
   br i1 %exitcond, label %exit, label %loop
 
 exit:
-  %ret = phi i32 [ 0, %entry ], [ %add1.next, %loop ]
-  ret i32 %ret
+  ret void
 }