[llvm] [SCEV] Add SafeWrap flag to AddRecs (PR #118483)

JĂșlio De Bastiani via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 3 04:47:31 PST 2024


https://github.com/OtherRandomUser created https://github.com/llvm/llvm-project/pull/118483

Added a flag to signal that an AddRec is expected to wrap. This allows for the vectorization of a few masking operations that would otherwise be discarded based on no-wrap predicates, such as:

```
float arr1[8192];
float arr2[8192];
float arr3[8192];
float arr4[8192];
float arr5[8192];
float arr6[8192];
float arr7[8192];
float arr8[8192];
float arr9[8192];

void test() {
  for(int i = 0; i < 8192; i += 1) {
    int ind = i & 0x7f;
    arr1[ind] = i;
    arr2[ind] = i;
    arr3[ind] = i;
    arr4[ind] = i;
    arr5[ind] = i;
    arr6[ind] = i;
    arr7[ind] = i;
    arr8[ind] = i;
    arr9[ind] = i;
  }
}
```

There was also one last test, Transforms/LoopVectorize/X86/multi-exit-cost.ll, that failed. Adding the safe-wrap flag in this case prevents any vectorization, but the vectorization that happened before this change doesn't make much sense to me — am I missing something?

>From 03de1c86bf636075ecd8dfd5f5574b9947277394 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=BAlio=20De=20Bastiani?=
 <julio.bastiani at expertisesolutions.com.br>
Date: Thu, 21 Nov 2024 16:47:18 -0300
Subject: [PATCH] [SCEV] Add SafeWrap flag to AddRecs

Added a flag to signal that an AddRec is expected to wrap. This allows
for the vectorization of a few masking operations that would otherwise
be discarded based on no-wrap predicates, such as:

```
float arr1[8192];
float arr2[8192];
float arr3[8192];
float arr4[8192];
float arr5[8192];
float arr6[8192];
float arr7[8192];
float arr8[8192];
float arr9[8192];

void test() {
  for(int i = 0; i < 8192; i += 1) {
    int ind = i & 0x7f;
    arr1[ind] = i;
    arr2[ind] = i;
    arr3[ind] = i;
    arr4[ind] = i;
    arr5[ind] = i;
    arr6[ind] = i;
    arr7[ind] = i;
    arr8[ind] = i;
    arr9[ind] = i;
  }
}
```
---
 llvm/include/llvm/Analysis/ScalarEvolution.h  |  27 +++--
 .../Analysis/ScalarEvolutionExpressions.h     |   4 +
 llvm/lib/Analysis/ScalarEvolution.cpp         |  49 ++++----
 llvm/test/Analysis/ScalarEvolution/pr87798.ll |   4 +-
 .../ScalarEvolution/shift-recurrences.ll      |   2 +-
 .../ScalarEvolution/solve-quadratic-i1.ll     |   4 +-
 .../ScalarEvolution/solve-quadratic.ll        |   4 +-
 .../IndVarSimplify/shrunk-constant.ll         |   2 +-
 .../LoopVectorize/RISCV/safe-wrap.ll          | 112 ++++++++++++++++++
 .../X86/x86_fp80-vector-store.ll              |   3 +-
 10 files changed, 171 insertions(+), 40 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/safe-wrap.ll

diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index b20c6a13cb6bd7..586b6232647c1e 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -124,11 +124,12 @@ class SCEV : public FoldingSetNode {
   /// at runtime.  A SCEV being defined does not require the existence of any
   /// instruction within the defined scope.
   enum NoWrapFlags {
-    FlagAnyWrap = 0,    // No guarantee.
-    FlagNW = (1 << 0),  // No self-wrap.
-    FlagNUW = (1 << 1), // No unsigned wrap.
-    FlagNSW = (1 << 2), // No signed wrap.
-    NoWrapMask = (1 << 3) - 1
+    FlagAnyWrap = 0,         // No guarantee.
+    FlagSafeWrap = (1 << 0), // Expected to wrap.
+    FlagNW = (1 << 1),       // No self-wrap.
+    FlagNUW = (1 << 2),      // No unsigned wrap.
+    FlagNSW = (1 << 3),      // No signed wrap.
+    NoWrapMask = (1 << 4) - 1
   };
 
   explicit SCEV(const FoldingSetNodeIDRef ID, SCEVTypes SCEVTy,
@@ -562,7 +563,9 @@ class ScalarEvolution {
   const SCEV *getConstant(Type *Ty, uint64_t V, bool isSigned = false);
   const SCEV *getLosslessPtrToIntExpr(const SCEV *Op, unsigned Depth = 0);
   const SCEV *getPtrToIntExpr(const SCEV *Op, Type *Ty);
-  const SCEV *getTruncateExpr(const SCEV *Op, Type *Ty, unsigned Depth = 0);
+  const SCEV *getTruncateExpr(const SCEV *Op, Type *Ty,
+                              SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap,
+                              unsigned Depth = 0);
   const SCEV *getVScale(Type *Ty);
   const SCEV *getElementCount(Type *Ty, ElementCount EC);
   const SCEV *getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth = 0);
@@ -707,13 +710,17 @@ class ScalarEvolution {
 
   /// Return a SCEV corresponding to a conversion of the input value to the
   /// specified type.  If the type must be extended, it is zero extended.
-  const SCEV *getTruncateOrZeroExtend(const SCEV *V, Type *Ty,
-                                      unsigned Depth = 0);
+  const SCEV *
+  getTruncateOrZeroExtend(const SCEV *V, Type *Ty,
+                          SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap,
+                          unsigned Depth = 0);
 
   /// Return a SCEV corresponding to a conversion of the input value to the
   /// specified type.  If the type must be extended, it is sign extended.
-  const SCEV *getTruncateOrSignExtend(const SCEV *V, Type *Ty,
-                                      unsigned Depth = 0);
+  const SCEV *
+  getTruncateOrSignExtend(const SCEV *V, Type *Ty,
+                          SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap,
+                          unsigned Depth = 0);
 
   /// Return a SCEV corresponding to a conversion of the input value to the
   /// specified type.  If the type must be extended, it is zero extended.  The
diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h b/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h
index 6eb1aca1cf76ad..6a894bd9dcb4f4 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h
@@ -233,6 +233,10 @@ class SCEVNAryExpr : public SCEV {
 
   bool hasNoSelfWrap() const { return getNoWrapFlags(FlagNW) != FlagAnyWrap; }
 
+  bool hasSafeWrap() const {
+    return getNoWrapFlags(FlagSafeWrap) != FlagAnyWrap;
+  }
+
   /// Methods for support type inquiry through isa, cast, and dyn_cast:
   static bool classof(const SCEV *S) {
     return S->getSCEVType() == scAddExpr || S->getSCEVType() == scMulExpr ||
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 882e938e69c0c2..3c6fb4498b9d64 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -317,6 +317,9 @@ void SCEV::print(raw_ostream &OS) const {
     if (AR->hasNoSelfWrap() &&
         !AR->getNoWrapFlags((NoWrapFlags)(FlagNUW | FlagNSW)))
       OS << "nw><";
+    if (AR->hasSafeWrap() &&
+        !AR->getNoWrapFlags((NoWrapFlags)(FlagNUW | FlagNSW | FlagNW)))
+      OS << "sw><";
     AR->getLoop()->getHeader()->printAsOperand(OS, /*PrintType=*/false);
     OS << ">";
     return;
@@ -1158,6 +1161,7 @@ const SCEV *ScalarEvolution::getPtrToIntExpr(const SCEV *Op, Type *Ty) {
 }
 
 const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, Type *Ty,
+                                             SCEV::NoWrapFlags Flags,
                                              unsigned Depth) {
   assert(getTypeSizeInBits(Op->getType()) > getTypeSizeInBits(Ty) &&
          "This is not a truncating conversion!");
@@ -1180,15 +1184,15 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, Type *Ty,
 
   // trunc(trunc(x)) --> trunc(x)
   if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op))
-    return getTruncateExpr(ST->getOperand(), Ty, Depth + 1);
+    return getTruncateExpr(ST->getOperand(), Ty, Flags, Depth + 1);
 
   // trunc(sext(x)) --> sext(x) if widening or trunc(x) if narrowing
   if (const SCEVSignExtendExpr *SS = dyn_cast<SCEVSignExtendExpr>(Op))
-    return getTruncateOrSignExtend(SS->getOperand(), Ty, Depth + 1);
+    return getTruncateOrSignExtend(SS->getOperand(), Ty, Flags, Depth + 1);
 
   // trunc(zext(x)) --> zext(x) if widening or trunc(x) if narrowing
   if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op))
-    return getTruncateOrZeroExtend(SZ->getOperand(), Ty, Depth + 1);
+    return getTruncateOrZeroExtend(SZ->getOperand(), Ty, Flags, Depth + 1);
 
   if (Depth > MaxCastDepth) {
     SCEV *S =
@@ -1208,7 +1212,8 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, Type *Ty,
     unsigned numTruncs = 0;
     for (unsigned i = 0, e = CommOp->getNumOperands(); i != e && numTruncs < 2;
          ++i) {
-      const SCEV *S = getTruncateExpr(CommOp->getOperand(i), Ty, Depth + 1);
+      const SCEV *S =
+          getTruncateExpr(CommOp->getOperand(i), Ty, Flags, Depth + 1);
       if (!isa<SCEVIntegralCastExpr>(CommOp->getOperand(i)) &&
           isa<SCEVTruncateExpr>(S))
         numTruncs++;
@@ -1232,8 +1237,8 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, Type *Ty,
   if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Op)) {
     SmallVector<const SCEV *, 4> Operands;
     for (const SCEV *Op : AddRec->operands())
-      Operands.push_back(getTruncateExpr(Op, Ty, Depth + 1));
-    return getAddRecExpr(Operands, AddRec->getLoop(), SCEV::FlagAnyWrap);
+      Operands.push_back(getTruncateExpr(Op, Ty, Flags, Depth + 1));
+    return getAddRecExpr(Operands, AddRec->getLoop(), Flags);
   }
 
   // Return zero if truncating to known zeros.
@@ -1632,7 +1637,7 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty,
     unsigned NewBits = getTypeSizeInBits(Ty);
     if (CR.truncate(TruncBits).zeroExtend(NewBits).contains(
             CR.zextOrTrunc(NewBits)))
-      return getTruncateOrZeroExtend(X, Ty, Depth);
+      return getTruncateOrZeroExtend(X, Ty, SCEV::FlagAnyWrap, Depth);
   }
 
   // If the input value is a chrec scev, and we can prove that the value
@@ -1669,10 +1674,10 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty,
 
         // Check whether the backedge-taken count can be losslessly casted to
         // the addrec's type. The count is always unsigned.
-        const SCEV *CastedMaxBECount =
-            getTruncateOrZeroExtend(MaxBECount, Start->getType(), Depth);
+        const SCEV *CastedMaxBECount = getTruncateOrZeroExtend(
+            MaxBECount, Start->getType(), SCEV::FlagAnyWrap, Depth);
         const SCEV *RecastedMaxBECount = getTruncateOrZeroExtend(
-            CastedMaxBECount, MaxBECount->getType(), Depth);
+            CastedMaxBECount, MaxBECount->getType(), SCEV::FlagAnyWrap, Depth);
         if (MaxBECount == RecastedMaxBECount) {
           Type *WideTy = IntegerType::get(getContext(), BitWidth * 2);
           // Check whether Start+Step*MaxBECount has no unsigned overflow.
@@ -1973,7 +1978,7 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty,
     unsigned NewBits = getTypeSizeInBits(Ty);
     if (CR.truncate(TruncBits).signExtend(NewBits).contains(
             CR.sextOrTrunc(NewBits)))
-      return getTruncateOrSignExtend(X, Ty, Depth);
+      return getTruncateOrSignExtend(X, Ty, SCEV::FlagAnyWrap, Depth);
   }
 
   if (auto *SA = dyn_cast<SCEVAddExpr>(Op)) {
@@ -2044,10 +2049,10 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty,
 
         // Check whether the backedge-taken count can be losslessly casted to
         // the addrec's type. The count is always unsigned.
-        const SCEV *CastedMaxBECount =
-            getTruncateOrZeroExtend(MaxBECount, Start->getType(), Depth);
+        const SCEV *CastedMaxBECount = getTruncateOrZeroExtend(
+            MaxBECount, Start->getType(), SCEV::FlagAnyWrap, Depth);
         const SCEV *RecastedMaxBECount = getTruncateOrZeroExtend(
-            CastedMaxBECount, MaxBECount->getType(), Depth);
+            CastedMaxBECount, MaxBECount->getType(), SCEV::FlagAnyWrap, Depth);
         if (MaxBECount == RecastedMaxBECount) {
           Type *WideTy = IntegerType::get(getContext(), BitWidth * 2);
           // Check whether Start+Step*MaxBECount has no signed overflow.
@@ -4714,6 +4719,7 @@ const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
 }
 
 const SCEV *ScalarEvolution::getTruncateOrZeroExtend(const SCEV *V, Type *Ty,
+                                                     SCEV::NoWrapFlags Flags,
                                                      unsigned Depth) {
   Type *SrcTy = V->getType();
   assert(SrcTy->isIntOrPtrTy() && Ty->isIntOrPtrTy() &&
@@ -4721,11 +4727,12 @@ const SCEV *ScalarEvolution::getTruncateOrZeroExtend(const SCEV *V, Type *Ty,
   if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
     return V;  // No conversion
   if (getTypeSizeInBits(SrcTy) > getTypeSizeInBits(Ty))
-    return getTruncateExpr(V, Ty, Depth);
+    return getTruncateExpr(V, Ty, Flags, Depth);
   return getZeroExtendExpr(V, Ty, Depth);
 }
 
 const SCEV *ScalarEvolution::getTruncateOrSignExtend(const SCEV *V, Type *Ty,
+                                                     SCEV::NoWrapFlags Flags,
                                                      unsigned Depth) {
   Type *SrcTy = V->getType();
   assert(SrcTy->isIntOrPtrTy() && Ty->isIntOrPtrTy() &&
@@ -4733,7 +4740,7 @@ const SCEV *ScalarEvolution::getTruncateOrSignExtend(const SCEV *V, Type *Ty,
   if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
     return V;  // No conversion
   if (getTypeSizeInBits(SrcTy) > getTypeSizeInBits(Ty))
-    return getTruncateExpr(V, Ty, Depth);
+    return getTruncateExpr(V, Ty, Flags, Depth);
   return getSignExtendExpr(V, Ty, Depth);
 }
 
@@ -7837,8 +7844,10 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
             ShiftedLHS = getUDivExpr(LHS, MulCount);
           return getMulExpr(
               getZeroExtendExpr(
-                  getTruncateExpr(ShiftedLHS,
-                      IntegerType::get(getContext(), BitWidth - LZ - TZ)),
+                  getTruncateExpr(
+                      ShiftedLHS,
+                      IntegerType::get(getContext(), BitWidth - LZ - TZ),
+                      SCEV::FlagSafeWrap, 0),
                   BO->LHS->getType()),
               MulCount);
         }
@@ -14790,7 +14799,7 @@ class SCEVPredicateRewriter : public SCEVRewriteVisitor<SCEVPredicateRewriter> {
   const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
     const SCEV *Operand = visit(Expr->getOperand());
     const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Operand);
-    if (AR && AR->getLoop() == L && AR->isAffine()) {
+    if (AR && AR->getLoop() == L && AR->isAffine() && !AR->hasSafeWrap()) {
       // This couldn't be folded because the operand didn't have the nuw
       // flag. Add the nusw flag as an assumption that we could make.
       const SCEV *Step = AR->getStepRecurrence(SE);
@@ -14806,7 +14815,7 @@ class SCEVPredicateRewriter : public SCEVRewriteVisitor<SCEVPredicateRewriter> {
   const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) {
     const SCEV *Operand = visit(Expr->getOperand());
     const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Operand);
-    if (AR && AR->getLoop() == L && AR->isAffine()) {
+    if (AR && AR->getLoop() == L && AR->isAffine() && !AR->hasSafeWrap()) {
       // This couldn't be folded because the operand didn't have the nsw
       // flag. Add the nssw flag as an assumption that we could make.
       const SCEV *Step = AR->getStepRecurrence(SE);
diff --git a/llvm/test/Analysis/ScalarEvolution/pr87798.ll b/llvm/test/Analysis/ScalarEvolution/pr87798.ll
index acd445993e47bc..c019c375cbb184 100644
--- a/llvm/test/Analysis/ScalarEvolution/pr87798.ll
+++ b/llvm/test/Analysis/ScalarEvolution/pr87798.ll
@@ -25,11 +25,11 @@ define i32 @pr87798() {
 ; CHECK-NEXT:    %add4 = add i32 %mul, %phi
 ; CHECK-NEXT:    --> {0,+,0,+,2,+,5,+,3}<%bb1> U: full-set S: full-set Exits: 0 LoopDispositions: { %bb1: Computable }
 ; CHECK-NEXT:    %and = and i32 %phi, 1
-; CHECK-NEXT:    --> (zext i1 {false,+,false,+,false,+,false,+,true}<%bb1> to i32) U: [0,2) S: [0,2) Exits: 0 LoopDispositions: { %bb1: Computable }
+; CHECK-NEXT:    --> (zext i1 {false,+,false,+,false,+,false,+,true}<sw><%bb1> to i32) U: [0,2) S: [0,2) Exits: 0 LoopDispositions: { %bb1: Computable }
 ; CHECK-NEXT:    %add5 = add i32 %phi3, 1
 ; CHECK-NEXT:    --> {1,+,1}<nuw><nsw><%bb1> U: [1,2) S: [1,2) Exits: 1 LoopDispositions: { %bb1: Computable }
 ; CHECK-NEXT:    %phi9 = phi i32 [ %and, %bb1 ]
-; CHECK-NEXT:    --> (zext i1 {false,+,false,+,false,+,false,+,true}<%bb1> to i32) U: [0,2) S: [0,2) --> 0 U: [0,1) S: [0,1)
+; CHECK-NEXT:    --> (zext i1 {false,+,false,+,false,+,false,+,true}<sw><%bb1> to i32) U: [0,2) S: [0,2) --> 0 U: [0,1) S: [0,1)
 ; CHECK-NEXT:    %zext = zext i32 %phi9 to i64
 ; CHECK-NEXT:    --> poison U: full-set S: full-set
 ; CHECK-NEXT:  Determining loop execution counts for: @pr87798
diff --git a/llvm/test/Analysis/ScalarEvolution/shift-recurrences.ll b/llvm/test/Analysis/ScalarEvolution/shift-recurrences.ll
index 6cd709bfff68f3..ed9220f9e001a2 100644
--- a/llvm/test/Analysis/ScalarEvolution/shift-recurrences.ll
+++ b/llvm/test/Analysis/ScalarEvolution/shift-recurrences.ll
@@ -323,7 +323,7 @@ define void @test_shl6(i1 %c) {
 ; CHECK-NEXT:    %iv.next = add i64 %iv, 1
 ; CHECK-NEXT:    --> {1,+,1}<nuw><nsw><%loop> U: [1,6) S: [1,6) Exits: 5 LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %shiftamt = and i64 %iv, 1
-; CHECK-NEXT:    --> (zext i1 {false,+,true}<%loop> to i64) U: [0,2) S: [0,2) Exits: 0 LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    --> (zext i1 {false,+,true}<sw><%loop> to i64) U: [0,2) S: [0,2) Exits: 0 LoopDispositions: { %loop: Computable }
 ; CHECK-NEXT:    %iv.shl.next = shl i64 %iv.shl, %shiftamt
 ; CHECK-NEXT:    --> %iv.shl.next U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: 16 LoopDispositions: { %loop: Variant }
 ; CHECK-NEXT:  Determining loop execution counts for: @test_shl6
diff --git a/llvm/test/Analysis/ScalarEvolution/solve-quadratic-i1.ll b/llvm/test/Analysis/ScalarEvolution/solve-quadratic-i1.ll
index fa4e5fb0ac433e..bcf156bb949411 100644
--- a/llvm/test/Analysis/ScalarEvolution/solve-quadratic-i1.ll
+++ b/llvm/test/Analysis/ScalarEvolution/solve-quadratic-i1.ll
@@ -15,7 +15,7 @@ define void @f0() {
 ; CHECK-NEXT:    %v3 = add nsw i16 %v1, %v0
 ; CHECK-NEXT:    --> {3,+,3,+,1}<%b1> U: full-set S: full-set Exits: 6 LoopDispositions: { %b1: Computable }
 ; CHECK-NEXT:    %v4 = and i16 %v3, 1
-; CHECK-NEXT:    --> (zext i1 {true,+,true,+,true}<%b1> to i16) U: [0,2) S: [0,2) Exits: 0 LoopDispositions: { %b1: Computable }
+; CHECK-NEXT:    --> (zext i1 {true,+,true,+,true}<sw><%b1> to i16) U: [0,2) S: [0,2) Exits: 0 LoopDispositions: { %b1: Computable }
 ; CHECK-NEXT:  Determining loop execution counts for: @f0
 ; CHECK-NEXT:  Loop %b1: backedge-taken count is i6 1
 ; CHECK-NEXT:  Loop %b1: constant max backedge-taken count is i6 1
@@ -54,7 +54,7 @@ define void @f1() #0 {
 ; CHECK-NEXT:    %v3 = add i16 %v0, %v2
 ; CHECK-NEXT:    --> {3,+,4,+,1}<%b1> U: full-set S: full-set Exits: 12 LoopDispositions: { %b1: Computable }
 ; CHECK-NEXT:    %v4 = and i16 %v3, 1
-; CHECK-NEXT:    --> (zext i1 {true,+,false,+,true}<%b1> to i16) U: [0,2) S: [0,2) Exits: 0 LoopDispositions: { %b1: Computable }
+; CHECK-NEXT:    --> (zext i1 {true,+,false,+,true}<sw><%b1> to i16) U: [0,2) S: [0,2) Exits: 0 LoopDispositions: { %b1: Computable }
 ; CHECK-NEXT:    %v6 = add nuw nsw i32 %v1, 1
 ; CHECK-NEXT:    --> {4,+,1}<nuw><nsw><%b1> U: [4,7) S: [4,7) Exits: 6 LoopDispositions: { %b1: Computable }
 ; CHECK-NEXT:    %v7 = phi i32 [ %v1, %b1 ]
diff --git a/llvm/test/Analysis/ScalarEvolution/solve-quadratic.ll b/llvm/test/Analysis/ScalarEvolution/solve-quadratic.ll
index fd02ef672a969e..bafe0606d8cd9b 100644
--- a/llvm/test/Analysis/ScalarEvolution/solve-quadratic.ll
+++ b/llvm/test/Analysis/ScalarEvolution/solve-quadratic.ll
@@ -41,7 +41,7 @@
 ; {14,+,14,+,14} -> X=0, Y=14, Z=14
 ;
 ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test01'
-; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {-2,+,-2,+,-2}<%loop>
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {-2,+,-2,+,-2}<sw><%loop>
 ; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 4
 ; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation -2x^2 + -2x + -4, coeff bw: 5, multiplied by 2
 ; CHECK: {{.*}}SolveQuadraticAddRecExact{{.*}}: solving for unsigned overflow
@@ -117,7 +117,7 @@ exit:
 ; {17,+,-1,+,2} -> X=-3, Y=20, Z=2
 ;
 ; CHECK-LABEL: Printing analysis 'Scalar Evolution Analysis' for function 'test03':
-; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {1,+,-1,+,2}<%loop>
+; CHECK: {{.*}}GetQuadraticEquation{{.*}}: analyzing quadratic addrec: {1,+,-1,+,2}<sw><%loop>
 ; CHECK: {{.*}}GetQuadraticEquation{{.*}}: addrec coeff bw: 4
 ; CHECK: {{.*}}GetQuadraticEquation{{.*}}: equation 2x^2 + -4x + 2, coeff bw: 5, multiplied by 2
 ; CHECK: {{.*}}SolveQuadraticAddRecExact{{.*}}: solving for unsigned overflow
diff --git a/llvm/test/Transforms/IndVarSimplify/shrunk-constant.ll b/llvm/test/Transforms/IndVarSimplify/shrunk-constant.ll
index 8ec8ec2c9e1f2c..5fcff4b98629b2 100644
--- a/llvm/test/Transforms/IndVarSimplify/shrunk-constant.ll
+++ b/llvm/test/Transforms/IndVarSimplify/shrunk-constant.ll
@@ -1,6 +1,6 @@
 ; RUN: opt < %s -passes='print<scalar-evolution>' 2>&1 | FileCheck %s
 
-; CHECK: -->  (1 + (zext i4 {-8,+,-8}<%loop> to i32))<nuw><nsw>
+; CHECK: -->  (1 + (zext i4 {-8,+,-8}<sw><%loop> to i32))<nuw><nsw>
 
 define fastcc void @foo() nounwind {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-wrap.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-wrap.ll
new file mode 100644
index 00000000000000..5f9275ba72cae5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-wrap.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=loop-vectorize,simplifycfg,instcombine -force-vector-interleave=1 -force-vector-width=4 -mtriple=riscv64 -mattr=+v < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+
+ at arr1 = dso_local local_unnamed_addr global [8192 x float] zeroinitializer, align 4
+ at arr2 = dso_local local_unnamed_addr global [8192 x float] zeroinitializer, align 4
+ at arr3 = dso_local local_unnamed_addr global [8192 x float] zeroinitializer, align 4
+ at arr4 = dso_local local_unnamed_addr global [8192 x float] zeroinitializer, align 4
+ at arr5 = dso_local local_unnamed_addr global [8192 x float] zeroinitializer, align 4
+ at arr6 = dso_local local_unnamed_addr global [8192 x float] zeroinitializer, align 4
+ at arr7 = dso_local local_unnamed_addr global [8192 x float] zeroinitializer, align 4
+ at arr8 = dso_local local_unnamed_addr global [8192 x float] zeroinitializer, align 4
+ at arr9 = dso_local local_unnamed_addr global [8192 x float] zeroinitializer, align 4
+
+define dso_local noundef float @loop3(i32 noundef signext %zero) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local noundef float @loop3(
+; CHECK-SAME: i32 noundef signext [[ZERO:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[VEC_IND]], splat (i32 127)
+; CHECK-NEXT:    [[TMP1:%.*]] = uitofp nneg <4 x i32> [[VEC_IND]] to <4 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = zext nneg <4 x i32> [[TMP0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [8192 x float], ptr @arr1, i64 0, <4 x i64> [[TMP2]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> [[TMP1]], <4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true)), !tbaa [[TBAA9:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [8192 x float], ptr @arr2, i64 0, <4 x i64> [[TMP2]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> [[TMP1]], <4 x ptr> [[TMP4]], i32 4, <4 x i1> splat (i1 true)), !tbaa [[TBAA9]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [8192 x float], ptr @arr3, i64 0, <4 x i64> [[TMP2]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> [[TMP1]], <4 x ptr> [[TMP5]], i32 4, <4 x i1> splat (i1 true)), !tbaa [[TBAA9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [8192 x float], ptr @arr4, i64 0, <4 x i64> [[TMP2]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> [[TMP1]], <4 x ptr> [[TMP6]], i32 4, <4 x i1> splat (i1 true)), !tbaa [[TBAA9]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [8192 x float], ptr @arr5, i64 0, <4 x i64> [[TMP2]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> [[TMP1]], <4 x ptr> [[TMP7]], i32 4, <4 x i1> splat (i1 true)), !tbaa [[TBAA9]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [8192 x float], ptr @arr6, i64 0, <4 x i64> [[TMP2]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> [[TMP1]], <4 x ptr> [[TMP8]], i32 4, <4 x i1> splat (i1 true)), !tbaa [[TBAA9]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [8192 x float], ptr @arr7, i64 0, <4 x i64> [[TMP2]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> [[TMP1]], <4 x ptr> [[TMP9]], i32 4, <4 x i1> splat (i1 true)), !tbaa [[TBAA9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [8192 x float], ptr @arr8, i64 0, <4 x i64> [[TMP2]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> [[TMP1]], <4 x ptr> [[TMP10]], i32 4, <4 x i1> splat (i1 true)), !tbaa [[TBAA9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [8192 x float], ptr @arr9, i64 0, <4 x i64> [[TMP2]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> [[TMP1]], <4 x ptr> [[TMP11]], i32 4, <4 x i1> splat (i1 true)), !tbaa [[TBAA9]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8192
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret float 0.000000e+00
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.044 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %and = and i32 %i.044, 127
+  %conv = uitofp nneg i32 %i.044 to float
+  %idxprom = zext nneg i32 %and to i64
+  %arrayidx = getelementptr inbounds [8192 x float], ptr @arr1, i64 0, i64 %idxprom
+  store float %conv, ptr %arrayidx, align 4, !tbaa !9
+  %arrayidx3 = getelementptr inbounds [8192 x float], ptr @arr2, i64 0, i64 %idxprom
+  store float %conv, ptr %arrayidx3, align 4, !tbaa !9
+  %arrayidx6 = getelementptr inbounds [8192 x float], ptr @arr3, i64 0, i64 %idxprom
+  store float %conv, ptr %arrayidx6, align 4, !tbaa !9
+  %arrayidx9 = getelementptr inbounds [8192 x float], ptr @arr4, i64 0, i64 %idxprom
+  store float %conv, ptr %arrayidx9, align 4, !tbaa !9
+  %arrayidx12 = getelementptr inbounds [8192 x float], ptr @arr5, i64 0, i64 %idxprom
+  store float %conv, ptr %arrayidx12, align 4, !tbaa !9
+  %arrayidx15 = getelementptr inbounds [8192 x float], ptr @arr6, i64 0, i64 %idxprom
+  store float %conv, ptr %arrayidx15, align 4, !tbaa !9
+  %arrayidx18 = getelementptr inbounds [8192 x float], ptr @arr7, i64 0, i64 %idxprom
+  store float %conv, ptr %arrayidx18, align 4, !tbaa !9
+  %arrayidx21 = getelementptr inbounds [8192 x float], ptr @arr8, i64 0, i64 %idxprom
+  store float %conv, ptr %arrayidx21, align 4, !tbaa !9
+  %arrayidx24 = getelementptr inbounds [8192 x float], ptr @arr9, i64 0, i64 %idxprom
+  store float %conv, ptr %arrayidx24, align 4, !tbaa !9
+  %add = add nuw nsw i32 %i.044, 1
+  %exitcond.not = icmp eq i32 %add, 8192
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nofree noinline norecurse nosync nounwind memory(write, argmem: none, inaccessiblemem: none) uwtable vscale_range(4,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zba,+zbb,+zbs,+zicond,+zicsr,+zifencei,+zmmul,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl256b,+zvl32b,+zvl64b,-b,-e,-experimental-smctr,-experimental-smmpm,-experimental-smnpm,-experimental-ssctr,-experimental-ssnpm,-experimental-sspm,-experimental-supm,-experimental-zacas,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-h,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xventanamatmul,-xwchc,-za128rs,-za64rs,-zaamo,-zabha,-zalrsc,-zama16b,-zawrs,-zbc,-zbkb,-zbkc,-zbkx,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfbfmin,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zihintntl,-zihintpause,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-ztso,-zvbb,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl16384b,-zvl2048b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
+
+!llvm.module.flags = !{!0, !1, !2, !4, !5, !6, !7}
+!llvm.ident = !{!8}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"target-abi", !"lp64d"}
+!2 = !{i32 6, !"riscv-isa", !3}
+!3 = !{!"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_v1p0_zicond1p0_zicsr2p0_zifencei2p0_zmmul1p0_zba1p0_zbb1p0_zbs1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvl128b1p0_zvl256b1p0_zvl32b1p0_zvl64b1p0"}
+!4 = !{i32 8, !"PIC Level", i32 2}
+!5 = !{i32 7, !"PIE Level", i32 2}
+!6 = !{i32 7, !"uwtable", i32 2}
+!7 = !{i32 8, !"SmallDataLimit", i32 0}
+!8 = !{!"clang version 20.0.0git (git at github.com:expertisesolutions/llvm.git 18fcedf2e278ec646fddb9b2d24a5abe98cdb7ed)"}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"float", !11, i64 0}
+!11 = !{!"omnipotent char", !12, i64 0}
+!12 = !{!"Simple C/C++ TBAA"}
+;.
+; CHECK: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0}
+; CHECK: [[META10]] = !{!"float", [[META11:![0-9]+]], i64 0}
+; CHECK: [[META11]] = !{!"omnipotent char", [[META12:![0-9]+]], i64 0}
+; CHECK: [[META12]] = !{!"Simple C/C++ TBAA"}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META14:![0-9]+]], [[META15:![0-9]+]]}
+; CHECK: [[META14]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META15]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
index 921cf4246f7259..15cb3e5b062536 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
@@ -33,8 +33,7 @@ for.body:                                         ; preds = %for.body, %entry
   %arrayidx = getelementptr inbounds [1024 x x86_fp80], ptr @x, i64 0, i64 %indvars.iv
   store x86_fp80 %conv, ptr %arrayidx, align 16
   %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body



More information about the llvm-commits mailing list