[llvm] LAA: make stride versioning code more robust (PR #97075)

Wed Jul 3 06:04:08 PDT 2024

https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/97075

>From 86c6ad69ad21332a315f624052abc716d547a214 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Wed, 3 Jul 2024 13:48:48 +0100
Subject: [PATCH 1/2] LAA: add pre-commit tests for stride-versioning

Add tests for when the Stride is unknown and equal to TC, with different
kinds of casts. In these cases, LAA should not speculate on Stride.
---
 .../LoopAccessAnalysis/symbolic-stride.ll     | 181 ++++++++++++++++++
 1 file changed, 181 insertions(+)

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
index 7c1b11e22aef2..b14eb6d6aa3db 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
@@ -310,3 +310,184 @@ loop:
 exit:                                          ; preds = %loop
   ret void
 }
+
+; Check the scenario where we have an unknown Stride, which happens to also be
+; the loop iteration count. If we speculate Stride==1, it implies that the loop
+; will iterate no more than a single iteration.
+define i32 @unknown_stride_equalto_tc(i32 %N, ptr %A, ptr %B, i32 %i, i32 %j)  {
+; CHECK-LABEL: 'unknown_stride_equalto_tc'
+; CHECK-NEXT:    for.body:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %tmp.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add1, %for.body ]
+  %k.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = mul i32 %k.09, %N
+  %add = add i32 %mul, %j
+  %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv = sext i16 %0 to i32
+  %add1 = add nsw i32 %tmp.010, %conv
+  %inc = add nuw i32 %k.09, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  %add1.lcssa = phi i32 [ %add1, %for.body ]
+  br label %for.end
+
+for.end:
+  %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add1.lcssa, %for.end.loopexit ]
+  ret i32 %tmp.0.lcssa
+}
+
+
+; Check the scenario where we have an unknown Stride, which happens to also be
+; the loop iteration count, but the TC is zero-extended from a narrower type.
+define i32 @unknown_stride_equalto_zext_tc(i16 zeroext %N, ptr %A, ptr %B, i32 %i, i32 %j) {
+; CHECK-LABEL: 'unknown_stride_equalto_zext_tc'
+; CHECK-NEXT:    for.body:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %conv = zext i16 %N to i32
+  %cmp11 = icmp eq i16 %N, 0
+  br i1 %cmp11, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %tmp.013 = phi i32 [ 0, %for.body.lr.ph ], [ %add4, %for.body ]
+  %k.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = mul nuw i32 %k.012, %conv
+  %add = add i32 %mul, %j
+  %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv3 = sext i16 %0 to i32
+  %add4 = add nsw i32 %tmp.013, %conv3
+  %inc = add nuw nsw i32 %k.012, 1
+  %exitcond = icmp eq i32 %inc, %conv
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  %add4.lcssa = phi i32 [ %add4, %for.body ]
+  br label %for.end
+
+for.end:
+  %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add4.lcssa, %for.end.loopexit ]
+  ret i32 %tmp.0.lcssa
+}
+
+; Check the scenario where we have an unknown Stride, which happens to also be
+; the loop iteration count, but the TC is sign-extended from a narrower type.
+define i32 @unknown_stride_equalto_sext_tc(i16 %N, ptr %A, ptr %B, i32 %i, i32 %j) {
+; CHECK-LABEL: 'unknown_stride_equalto_sext_tc'
+; CHECK-NEXT:    for.body:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %conv = sext i16 %N to i32
+  %cmp11 = icmp eq i16 %N, 0
+  br i1 %cmp11, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %tmp.013 = phi i32 [ 0, %for.body.lr.ph ], [ %add4, %for.body ]
+  %k.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = mul nuw i32 %k.012, %conv
+  %add = add i32 %mul, %j
+  %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv3 = sext i16 %0 to i32
+  %add4 = add nsw i32 %tmp.013, %conv3
+  %inc = add nuw nsw i32 %k.012, 1
+  %exitcond = icmp eq i32 %inc, %conv
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  %add4.lcssa = phi i32 [ %add4, %for.body ]
+  br label %for.end
+
+for.end:
+  %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add4.lcssa, %for.end.loopexit ]
+  ret i32 %tmp.0.lcssa
+}
+
+; Check the scenario where we have an unknown Stride, which happens to also be
+; the loop iteration count, but the TC is truncated from a wider type.
+define i32 @unknown_stride_equalto_trunc_tc(i64 %N, ptr %A, ptr %B, i32 %i, i32 %j) {
+; CHECK-LABEL: 'unknown_stride_equalto_trunc_tc'
+; CHECK-NEXT:    for.body:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %conv = trunc i64 %N to i32
+  %cmp11 = icmp eq i64 %N, 0
+  br i1 %cmp11, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %tmp.013 = phi i32 [ 0, %for.body.lr.ph ], [ %add4, %for.body ]
+  %k.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = mul nuw i32 %k.012, %conv
+  %add = add i32 %mul, %j
+  %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv3 = sext i16 %0 to i32
+  %add4 = add nsw i32 %tmp.013, %conv3
+  %inc = add nuw nsw i32 %k.012, 1
+  %exitcond = icmp eq i32 %inc, %conv
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  %add4.lcssa = phi i32 [ %add4, %for.body ]
+  br label %for.end
+
+for.end:
+  %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add4.lcssa, %for.end.loopexit ]
+  ret i32 %tmp.0.lcssa
+}

>From 7b1876adf3657500c2310088b40f74bf8f038424 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 27 Jun 2024 17:12:33 +0100
Subject: [PATCH 2/2] LAA: make stride versioning code more robust

Rewrite the stride-versioning code in LoopAccessAnalysis more robust,
and make it possible to directly return the SCEVUnknown under a cast in
getStrideFromPointer, eliminating a second cast-stripping in
collectStridedAccess.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp | 47 ++++++++----------------
 1 file changed, 15 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 38bf6d8160aa9..cbf7630df935f 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2914,7 +2914,7 @@ static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *L
 
   if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(V))
     if (isa<SCEVUnknown>(C->getOperand()))
-      return V;
+      return C->getOperand();
 
   return nullptr;
 }
@@ -2930,7 +2930,8 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
   // computation of an interesting IV - but we chose not to as we
   // don't have a cost model here, and broadening the scope exposes
   // far too many unprofitable cases.
-  const SCEV *StrideExpr = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
+  ScalarEvolution *SE = PSE->getSE();
+  const SCEV *StrideExpr = getStrideFromPointer(Ptr, SE, TheLoop);
   if (!StrideExpr)
     return;
 
@@ -2956,40 +2957,22 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
   // of various possible stride specializations, considering the alternatives
   // of using gather/scatters (if available).
 
-  const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount();
-
-  // Match the types so we can compare the stride and the MaxBTC.
-  // The Stride can be positive/negative, so we sign extend Stride;
-  // The backedgeTakenCount is non-negative, so we zero extend MaxBTC.
-  const DataLayout &DL = TheLoop->getHeader()->getDataLayout();
-  uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(StrideExpr->getType());
-  uint64_t BETypeSizeBits = DL.getTypeSizeInBits(MaxBTC->getType());
-  const SCEV *CastedStride = StrideExpr;
-  const SCEV *CastedBECount = MaxBTC;
-  ScalarEvolution *SE = PSE->getSE();
-  if (BETypeSizeBits >= StrideTypeSizeBits)
-    CastedStride = SE->getNoopOrSignExtend(StrideExpr, MaxBTC->getType());
-  else
-    CastedBECount = SE->getZeroExtendExpr(MaxBTC, StrideExpr->getType());
-  const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
-  // Since TripCount == BackEdgeTakenCount + 1, checking:
-  // "Stride >= TripCount" is equivalent to checking:
-  // Stride - MaxBTC> 0
-  if (SE->isKnownPositive(StrideMinusBETaken)) {
-    LLVM_DEBUG(
-        dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
-                  "Stride==1 predicate will imply that the loop executes "
-                  "at most once.\n");
+  // Stride is zero-extended or truncated, so BTC can be subtracted.
+  const SCEV *BTC = PSE->getBackedgeTakenCount();
+  const SCEV *CastedStride =
+      SE->getTruncateOrZeroExtend(StrideExpr, BTC->getType());
+  const SCEV *StrideMinusBTC = SE->getMinusSCEV(CastedStride, BTC);
+
+  // Stride - BTC > 0 is equivalent to Stride >= TripCount, but computing
+  // TripCount from BTC would introduce more casts, and Stride - TC might fail
+  // the known-non-negative test.
+  if (SE->isKnownPositive(StrideMinusBTC)) {
+    LLVM_DEBUG(dbgs() << "LAA: Not versioning with Stride==1 predicate.\n");
     return;
   }
   LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");
 
-  // Strip back off the integer cast, and check that our result is a
-  // SCEVUnknown as we expect.
-  const SCEV *StrideBase = StrideExpr;
-  if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(StrideBase))
-    StrideBase = C->getOperand();
-  SymbolicStrides[Ptr] = cast<SCEVUnknown>(StrideBase);
+  SymbolicStrides[Ptr] = cast<SCEVUnknown>(StrideExpr);
 }
 
 LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,