[llvm] [InstCombine] Lower usub.with.overflow to explicit subtraction + unsigned comparison (PR #170896)

via llvm-commits llvm-commits at lists.llvm.org
Sat Dec 6 09:46:37 PST 2025


https://github.com/aabhinavg1 updated https://github.com/llvm/llvm-project/pull/170896

>From b268573d6a03fe14e22a7703dcd6a284e5d0ca9a Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Fri, 5 Dec 2025 23:43:14 +0530
Subject: [PATCH 1/4] [InstCombine] Lower usub.with.overflow to explicit
 subtraction + unsigned comparison

---
 .../InstCombine/InstCombineCalls.cpp          | 13 ++++
 .../test/Transforms/InstCombine/known-bits.ll | 15 +++--
 llvm/test/Transforms/InstCombine/pr170634.ll  | 33 ++++++++++
 ...ult-of-usub-is-non-zero-and-no-overflow.ll | 60 +++++++++----------
 .../usub-overflow-known-by-implied-cond.ll    | 40 +++++--------
 llvm/test/Transforms/InstCombine/usubo.ll     | 10 ++--
 .../Transforms/InstCombine/with_overflow.ll   |  7 ++-
 7 files changed, 108 insertions(+), 70 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/pr170634.ll
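
A short reference sketch of the intended rewrite, for readers skimming the
diff below: usub.with.overflow(X, Y) becomes an explicit sub plus an unsigned
compare, repackaged into the original { iN, i1 } aggregate. The i32 width and
the value names are illustrative only and are not taken from the patch.

  ; Before: the intrinsic form.
  declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32)
  define { i32, i1 } @usub_intrinsic(i32 %x, i32 %y) {
    %agg = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %x, i32 %y)
    ret { i32, i1 } %agg
  }

  ; After: the expanded form this patch emits (a later patch in the series
  ; replaces the undef placeholder with poison).
  define { i32, i1 } @usub_expanded(i32 %x, i32 %y) {
    %diff = sub i32 %x, %y
    %ov = icmp ult i32 %x, %y          ; overflow <=> borrow <=> X u< Y
    %t0 = insertvalue { i32, i1 } undef, i32 %diff, 0
    %agg = insertvalue { i32, i1 } %t0, i1 %ov, 1
    ret { i32, i1 } %agg
  }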

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 743c4f574e131..af85985843914 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -864,6 +864,19 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
   if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
                             WO->getRHS(), *WO, OperationResult, OverflowResult))
     return createOverflowTuple(WO, OperationResult, OverflowResult);
+    
+  // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y} 
+  if (WO->getBinaryOp() == Instruction::Sub && !WO->isSigned()) {
+    IRBuilder<> Builder(WO);
+    Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
+    Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
+    
+    Value *ResultStruct = UndefValue::get(WO->getType());
+    ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
+    ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
+  
+    return replaceInstUsesWith(*WO, ResultStruct);
+  }
 
   // See whether we can optimize the overflow check with assumption information.
   for (User *U : WO->users()) {
diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index da2123a5dfe74..fc73ce5503ffe 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -1068,12 +1068,12 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
 ; CHECK-LABEL: @extract_value_usub(
 ; CHECK-NEXT:    [[Z:%.*]] = add nuw i8 [[ZZ:%.*]], 1
 ; CHECK-NEXT:    [[Y:%.*]] = add i8 [[X:%.*]], [[Z]]
-; CHECK-NEXT:    [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
-; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
-; CHECK-NEXT:    [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
+; CHECK-NEXT:    [[SUB:%.*]] = xor i8 [[ZZ]], -1
+; CHECK-NEXT:    [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
 ; CHECK-NEXT:    call void @use.i1(i1 [[UOV]])
 ; CHECK-NEXT:    call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT:    ret i1 false
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[ZZ]], -1
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %z = add nuw i8 %zz, 1
   %y = add i8 %x, %z
@@ -1090,12 +1090,11 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
 define i1 @extract_value_usub_fail(i8 %x, i8 %z) {
 ; CHECK-LABEL: @extract_value_usub_fail(
 ; CHECK-NEXT:    [[Y:%.*]] = add i8 [[X:%.*]], [[Z:%.*]]
-; CHECK-NEXT:    [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
-; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
-; CHECK-NEXT:    [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
+; CHECK-NEXT:    [[SUB:%.*]] = sub i8 0, [[Z]]
+; CHECK-NEXT:    [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
 ; CHECK-NEXT:    call void @use.i1(i1 [[UOV]])
 ; CHECK-NEXT:    call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[SUB]], 0
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[Z]], 0
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %y = add i8 %x, %z
diff --git a/llvm/test/Transforms/InstCombine/pr170634.ll b/llvm/test/Transforms/InstCombine/pr170634.ll
new file mode 100644
index 0000000000000..62a332e14b04a
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/pr170634.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+define dso_local i64 @func(i64 noundef %x, i64 noundef %y) local_unnamed_addr {
+; CHECK-LABEL: @func(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[RETURN:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nuw i64 [[X]], [[Y]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i64 [ 291, [[IF_THEN]] ], [ [[TMP1]], [[IF_END]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL_0]]
+;
+entry:
+  %0 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %x, i64 %y)
+  %1 = extractvalue { i64, i1 } %0, 1
+  %2 = extractvalue { i64, i1 } %0, 0
+  br i1 %1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  br label %return
+
+if.end:                                           ; preds = %entry
+  br label %return
+
+return:                                           ; preds = %if.end, %if.then
+  %retval.0 = phi i64 [ 291, %if.then ], [ %2, %if.end ]
+  ret i64 %retval.0
+}
+
diff --git a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
index 30a5072c7edc8..46b8a853e6cf5 100644
--- a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
@@ -141,16 +141,16 @@ define i1 @t1_strict_logical(i8 %base, i8 %offset) {
 
 define i1 @t2(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t2(
-; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -168,16 +168,16 @@ define i1 @t2(i8 %base, i8 %offset) {
 
 define i1 @t2_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t2_logical(
-; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -321,16 +321,16 @@ define i1 @t5_commutability2_logical(i8 %base, i8 %offset) {
 
 define i1 @t6_commutability(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t6_commutability(
-; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -348,16 +348,16 @@ define i1 @t6_commutability(i8 %base, i8 %offset) {
 
 define i1 @t6_commutability_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t6_commutability_logical(
-; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -459,14 +459,14 @@ define i1 @t7_nonstrict_logical(i8 %base, i8 %offset) {
 
 define i1 @t8(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t8(
-; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -482,14 +482,14 @@ define i1 @t8(i8 %base, i8 %offset) {
 
 define i1 @t8_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t8_logical(
-; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
diff --git a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
index 90ca39a70a0bb..c9030e5ab0321 100644
--- a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
+++ b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
@@ -175,11 +175,10 @@ define i32 @test7(i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[BB1:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -205,11 +204,10 @@ define i32 @test8(i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    br i1 [[COND_NOT]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -296,11 +294,10 @@ define i32 @test10(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[AND:%.*]] = and i1 [[COND]], [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -328,11 +325,10 @@ define i32 @test10_logical(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[AND:%.*]] = select i1 [[COND]], i1 [[COND2:%.*]], i1 false
 ; CHECK-NEXT:    br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -360,11 +356,10 @@ define i32 @test11(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -392,11 +387,10 @@ define i32 @test11_logical(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -424,11 +418,10 @@ define i32 @test12(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -456,11 +449,10 @@ define i32 @test12_logical(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
diff --git a/llvm/test/Transforms/InstCombine/usubo.ll b/llvm/test/Transforms/InstCombine/usubo.ll
index 2074190a2cd45..e4b9c0e08ba22 100644
--- a/llvm/test/Transforms/InstCombine/usubo.ll
+++ b/llvm/test/Transforms/InstCombine/usubo.ll
@@ -130,10 +130,9 @@ define i1 @sub_ne0(i8 %x, i8 %y, i1 %b) {
 
 define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
 ; CHECK-LABEL: @sub_eq1(
-; CHECK-NEXT:    [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
-; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
+; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
 ; CHECK-NEXT:    call void @use(i1 [[OV]])
-; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
 ; CHECK-NEXT:    [[EQ1:%.*]] = icmp eq i8 [[SUB]], 1
 ; CHECK-NEXT:    ret i1 [[EQ1]]
 ;
@@ -149,10 +148,9 @@ define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
 
 define i1 @sub_sgt0(i8 %x, i8 %y, i1 %b) {
 ; CHECK-LABEL: @sub_sgt0(
-; CHECK-NEXT:    [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
-; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
+; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
 ; CHECK-NEXT:    call void @use(i1 [[OV]])
-; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
 ; CHECK-NEXT:    [[SGT0:%.*]] = icmp sgt i8 [[SUB]], 0
 ; CHECK-NEXT:    ret i1 [[SGT0]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/with_overflow.ll b/llvm/test/Transforms/InstCombine/with_overflow.ll
index fa810408730e1..4f7a15cc89d6c 100644
--- a/llvm/test/Transforms/InstCombine/with_overflow.ll
+++ b/llvm/test/Transforms/InstCombine/with_overflow.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+; RUN: opt -passes='instcombine<no-verify-fixpoint>' -S < %s | FileCheck %s
 
 declare { i8, i1 } @llvm.uadd.with.overflow.i8(i8, i8) nounwind readnone
 declare { i8, i1 } @llvm.sadd.with.overflow.i8(i8, i8) nounwind readnone
@@ -506,7 +506,10 @@ define { i32, i1 } @ssub_no_canonicalize_constant_arg0(i32 %x) nounwind {
 
 define { i32, i1 } @usub_no_canonicalize_constant_arg0(i32 %x) nounwind {
 ; CHECK-LABEL: @usub_no_canonicalize_constant_arg0(
-; CHECK-NEXT:    [[A:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 [[X:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 42, [[X:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 42, [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i32, i1 } undef, i32 [[TMP1]], 0
+; CHECK-NEXT:    [[A:%.*]] = insertvalue { i32, i1 } [[TMP3]], i1 [[TMP2]], 1
 ; CHECK-NEXT:    ret { i32, i1 } [[A]]
 ;
   %a = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 %x)

>From e3fdf8dd13a1a8c3fc3ea7dd1916762d95276570 Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Fri, 5 Dec 2025 23:46:22 +0530
Subject: [PATCH 2/4] formatted with git clang-format HEAD~1

---
 llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index af85985843914..3bd7eb855b147 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -864,17 +864,17 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
   if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
                             WO->getRHS(), *WO, OperationResult, OverflowResult))
     return createOverflowTuple(WO, OperationResult, OverflowResult);
-    
-  // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y} 
+
+  // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y}
   if (WO->getBinaryOp() == Instruction::Sub && !WO->isSigned()) {
     IRBuilder<> Builder(WO);
     Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
     Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
-    
+
     Value *ResultStruct = UndefValue::get(WO->getType());
     ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
     ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
-  
+
     return replaceInstUsesWith(*WO, ResultStruct);
   }
 

>From aeef41f725b96ec57f72c2eb9788735419ae7172 Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Sat, 6 Dec 2025 00:27:48 +0530
Subject: [PATCH 3/4] fix formatting and replace undef with poison

---
 llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp |  2 +-
 .../result-of-usub-is-non-zero-and-no-overflow.ll    | 12 ++++++------
 llvm/test/Transforms/InstCombine/with_overflow.ll    |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)
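
A note on the undef -> poison change below: both aggregate fields are written
before any use, so the initial placeholder is never observed, and poison is
the preferred base value for such insertvalue chains (undef is being phased
out). A minimal sketch, with a hypothetical helper name not taken from the
patch:

  define { i8, i1 } @build_result_pair(i8 %diff, i1 %ov) {
    ; The poison base is fully overwritten by the two inserts.
    %t0 = insertvalue { i8, i1 } poison, i8 %diff, 0
    %agg = insertvalue { i8, i1 } %t0, i1 %ov, 1
    ret { i8, i1 } %agg
  }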

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 3bd7eb855b147..d0b71f12c3159 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -871,7 +871,7 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
     Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
     Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
 
-    Value *ResultStruct = UndefValue::get(WO->getType());
+    Value *ResultStruct = PoisonValue::get(WO->getType());
     ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
     ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
 
diff --git a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
index 46b8a853e6cf5..f8b318bc3680a 100644
--- a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
@@ -143,7 +143,7 @@ define i1 @t2(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t2(
 ; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
 ; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
@@ -170,7 +170,7 @@ define i1 @t2_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t2_logical(
 ; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
 ; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
@@ -323,7 +323,7 @@ define i1 @t6_commutability(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t6_commutability(
 ; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
 ; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
@@ -350,7 +350,7 @@ define i1 @t6_commutability_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t6_commutability_logical(
 ; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
 ; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
@@ -461,7 +461,7 @@ define i1 @t8(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t8(
 ; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
 ; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
@@ -484,7 +484,7 @@ define i1 @t8_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t8_logical(
 ; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
 ; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
diff --git a/llvm/test/Transforms/InstCombine/with_overflow.ll b/llvm/test/Transforms/InstCombine/with_overflow.ll
index 4f7a15cc89d6c..0c82bdc256ddf 100644
--- a/llvm/test/Transforms/InstCombine/with_overflow.ll
+++ b/llvm/test/Transforms/InstCombine/with_overflow.ll
@@ -508,7 +508,7 @@ define { i32, i1 } @usub_no_canonicalize_constant_arg0(i32 %x) nounwind {
 ; CHECK-LABEL: @usub_no_canonicalize_constant_arg0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 42, [[X:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 42, [[X]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i32, i1 } undef, i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i32, i1 } poison, i32 [[TMP1]], 0
 ; CHECK-NEXT:    [[A:%.*]] = insertvalue { i32, i1 } [[TMP3]], i1 [[TMP2]], 1
 ; CHECK-NEXT:    ret { i32, i1 } [[A]]
 ;

>From 87d56d3d369db1fef1789ccbc3f7890e30daa96a Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Sat, 6 Dec 2025 23:16:05 +0530
Subject: [PATCH 4/4] Address review feedback

---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |    4 +-
 .../InstCombine/InstCombineCalls.cpp          |   13 -
 .../test/CodeGen/RISCV/arith-with-overflow.ll |    7 +-
 .../CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll |   24 +-
 llvm/test/CodeGen/RISCV/rvv/abs-vp.ll         |    4 +-
 llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll  |    8 +-
 llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll       |    8 +-
 llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll        |   44 +-
 llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll        |   16 +-
 llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll       |   46 +-
 llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll        |   68 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll |   16 +-
 .../RISCV/rvv/fixed-vectors-bitreverse-vp.ll  |   16 +-
 .../RISCV/rvv/fixed-vectors-bswap-vp.ll       |   16 +-
 .../RISCV/rvv/fixed-vectors-ceil-vp.ll        |   64 +-
 .../RISCV/rvv/fixed-vectors-ctlz-vp.ll        |  432 ++-
 .../RISCV/rvv/fixed-vectors-ctpop-vp.ll       |   76 +-
 .../RISCV/rvv/fixed-vectors-cttz-vp.ll        |  244 +-
 .../RISCV/rvv/fixed-vectors-floor-vp.ll       |   64 +-
 .../RISCV/rvv/fixed-vectors-fmaximum-vp.ll    |   77 +-
 .../RISCV/rvv/fixed-vectors-fminimum-vp.ll    |   77 +-
 .../RISCV/rvv/fixed-vectors-fpext-vp.ll       |    8 +-
 .../RISCV/rvv/fixed-vectors-fptosi-vp.ll      |   16 +-
 .../RISCV/rvv/fixed-vectors-fptoui-vp.ll      |   16 +-
 .../RISCV/rvv/fixed-vectors-fptrunc-vp.ll     |    8 +-
 .../RISCV/rvv/fixed-vectors-nearbyint-vp.ll   |   32 +-
 .../rvv/fixed-vectors-reduction-fp-vp.ll      |   16 +-
 .../rvv/fixed-vectors-reduction-int-vp.ll     |    8 +-
 .../rvv/fixed-vectors-reduction-mask-vp.ll    |    8 +-
 .../RISCV/rvv/fixed-vectors-rint-vp.ll        |   32 +-
 .../RISCV/rvv/fixed-vectors-round-vp.ll       |   64 +-
 .../RISCV/rvv/fixed-vectors-roundeven-vp.ll   |   64 +-
 .../RISCV/rvv/fixed-vectors-roundtozero-vp.ll |   64 +-
 .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll    |   16 +-
 .../RISCV/rvv/fixed-vectors-setcc-int-vp.ll   |   50 +-
 .../RISCV/rvv/fixed-vectors-sext-vp.ll        |   16 +-
 .../RISCV/rvv/fixed-vectors-sitofp-vp.ll      |   16 +-
 .../RISCV/rvv/fixed-vectors-strided-vpload.ll |   74 +-
 .../rvv/fixed-vectors-strided-vpstore.ll      |   18 +-
 .../RISCV/rvv/fixed-vectors-trunc-vp.ll       |  299 +-
 .../RISCV/rvv/fixed-vectors-uitofp-vp.ll      |   16 +-
 .../RISCV/rvv/fixed-vectors-vadd-vp.ll        |   32 +-
 .../RISCV/rvv/fixed-vectors-vcopysign-vp.ll   |   16 +-
 .../RISCV/rvv/fixed-vectors-vfabs-vp.ll       |   16 +-
 .../RISCV/rvv/fixed-vectors-vfma-vp.ll        |   48 +-
 .../RISCV/rvv/fixed-vectors-vfmax-vp.ll       |   16 +-
 .../RISCV/rvv/fixed-vectors-vfmin-vp.ll       |   16 +-
 .../RISCV/rvv/fixed-vectors-vfmuladd-vp.ll    |   48 +-
 .../RISCV/rvv/fixed-vectors-vfneg-vp.ll       |   16 +-
 .../RISCV/rvv/fixed-vectors-vfsqrt-vp.ll      |   16 +-
 .../RISCV/rvv/fixed-vectors-vmax-vp.ll        |   24 +-
 .../RISCV/rvv/fixed-vectors-vmaxu-vp.ll       |   24 +-
 .../RISCV/rvv/fixed-vectors-vmin-vp.ll        |   24 +-
 .../RISCV/rvv/fixed-vectors-vminu-vp.ll       |   24 +-
 .../RISCV/rvv/fixed-vectors-vpgather.ll       |  184 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vpload.ll |   24 +-
 .../RISCV/rvv/fixed-vectors-vpmerge.ll        |   16 +-
 .../RISCV/rvv/fixed-vectors-vpscatter.ll      |   64 +-
 .../RISCV/rvv/fixed-vectors-vpstore.ll        |    8 +-
 .../RISCV/rvv/fixed-vectors-vsadd-vp.ll       |   32 +-
 .../RISCV/rvv/fixed-vectors-vsaddu-vp.ll      |   32 +-
 .../RISCV/rvv/fixed-vectors-vselect-vp.ll     |   28 +-
 .../RISCV/rvv/fixed-vectors-vssub-vp.ll       |   32 +-
 .../RISCV/rvv/fixed-vectors-vssubu-vp.ll      |   32 +-
 .../RISCV/rvv/fixed-vectors-zext-vp.ll        |   16 +-
 llvm/test/CodeGen/RISCV/rvv/floor-vp.ll       |   52 +-
 llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll    |   68 +-
 llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll    |   68 +-
 llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll   |    4 +-
 llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll      |    2 +-
 llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll       |    2 +-
 llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll   |   52 +-
 .../RISCV/rvv/nontemporal-vp-scalable.ll      | 3010 ++++++++---------
 llvm/test/CodeGen/RISCV/rvv/rint-vp.ll        |   52 +-
 llvm/test/CodeGen/RISCV/rvv/round-vp.ll       |   52 +-
 llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll   |   52 +-
 llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll |   52 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll    |  869 +++--
 llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll   |  297 +-
 llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll |   50 +-
 .../test/CodeGen/RISCV/rvv/strided-vpstore.ll |   82 +-
 llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll        |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll       |    4 +-
 llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll       |  531 +--
 llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll       |  276 +-
 llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll        | 2956 ++++------------
 llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll       |   52 +-
 llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll       |   52 +-
 llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll       |  139 +-
 llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll    |    6 +-
 llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll       |    4 +-
 llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll      |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll     |    6 +-
 llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll     |    6 +-
 llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll    |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll      |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll       |  276 +-
 llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll        |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll       |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll        |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll       |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll |   16 +-
 llvm/test/CodeGen/RISCV/rvv/vp-splat.ll       |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vp-splice.ll      |  140 +-
 .../test/CodeGen/RISCV/rvv/vpgather-sdnode.ll |   28 +-
 llvm/test/CodeGen/RISCV/rvv/vpload.ll         |   34 +-
 llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll |   18 +-
 .../CodeGen/RISCV/rvv/vpscatter-sdnode.ll     |   16 +-
 llvm/test/CodeGen/RISCV/rvv/vpstore.ll        |   20 +-
 .../CodeGen/RISCV/rvv/vreductions-fp-vp.ll    |    4 +-
 .../CodeGen/RISCV/rvv/vreductions-int-vp.ll   |    2 +-
 .../CodeGen/RISCV/rvv/vreductions-mask-vp.ll  |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll       |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll      |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll     |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll       |    4 +-
 llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll     |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll       |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll      |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll      |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll     |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll       |    4 +-
 llvm/test/CodeGen/RISCV/usub_sat.ll           |   48 +-
 llvm/test/CodeGen/RISCV/usub_sat_plus.ll      |   44 +-
 llvm/test/CodeGen/RISCV/xaluo.ll              |  129 +-
 llvm/test/CodeGen/RISCV/xqcia.ll              |    6 +-
 .../test/Transforms/InstCombine/known-bits.ll |   15 +-
 llvm/test/Transforms/InstCombine/pr170634.ll  |    5 +-
 ...ult-of-usub-is-non-zero-and-no-overflow.ll |   60 +-
 .../usub-overflow-known-by-implied-cond.ll    |   40 +-
 llvm/test/Transforms/InstCombine/usubo.ll     |   10 +-
 .../Transforms/InstCombine/with_overflow.ll   |    5 +-
 132 files changed, 5135 insertions(+), 7744 deletions(-)
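
For reviewing the TargetLowering::expandUADDSUBO change below, here are the
LangRef overflow conditions for the two unsigned intrinsics, written out as IR
for reference (a reminder of the expected semantics, not something taken from
the patch; names are illustrative):

  ; uadd.with.overflow(x, y) overflows iff the wrapped sum is less than an
  ; operand: (x + y) u< x.
  define i1 @uaddo_overflow_ref(i32 %x, i32 %y) {
    %sum = add i32 %x, %y
    %ov = icmp ult i32 %sum, %x
    ret i1 %ov
  }

  ; usub.with.overflow(x, y) overflows iff a borrow occurs: x u< y.
  define i1 @usubo_overflow_ref(i32 %x, i32 %y) {
    %ov = icmp ult i32 %x, %y
    ret i1 %ov
  }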

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 172c7485e108b..8b46c4c1e66db 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11466,7 +11466,9 @@ void TargetLowering::expandUADDSUBO(
                      DAG.getConstant(0, dl, Node->getValueType(0)), ISD::SETNE);
   } else {
     ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT;
-    SetCC = DAG.getSetCC(dl, SetCCType, Result, LHS, CC);
+    SDValue CompareLHS = IsAdd ? Result : LHS;
+    SDValue CompareRHS = IsAdd ? LHS : RHS;
+    SetCC = DAG.getSetCC(dl, SetCCType, CompareLHS, CompareRHS, CC);
   }
   Overflow = DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType);
 }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d0b71f12c3159..743c4f574e131 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -865,19 +865,6 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
                             WO->getRHS(), *WO, OperationResult, OverflowResult))
     return createOverflowTuple(WO, OperationResult, OverflowResult);
 
-  // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y}
-  if (WO->getBinaryOp() == Instruction::Sub && !WO->isSigned()) {
-    IRBuilder<> Builder(WO);
-    Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
-    Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
-
-    Value *ResultStruct = PoisonValue::get(WO->getType());
-    ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
-    ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
-
-    return replaceInstUsesWith(*WO, ResultStruct);
-  }
-
   // See whether we can optimize the overflow check with assumption information.
   for (User *U : WO->users()) {
     if (!match(U, m_ExtractValue<1>(m_Value())))
diff --git a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
index 557b4b7c2afa2..84526a1fca0f9 100644
--- a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
+++ b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
@@ -54,9 +54,10 @@ entry:
 define i1 @usub(i32 %a, i32 %b, ptr %c) nounwind {
 ; RV32I-LABEL: usub:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
-; RV32I-NEXT:    sw a1, 0(a2)
+; RV32I-NEXT:    sltu a3, a1, a0
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    sw a0, 0(a2)
+; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    ret
 entry:
   %x = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
index ea9786d0b10b3..f5f122a8c9dd7 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
@@ -715,7 +715,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32I-NEXT:    zext.b a0, a3
 ; RV32I-NEXT:    sub a1, a0, s1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sltu a0, s1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    and a2, a0, a1
 ; RV32I-NEXT:    sb a3, 3(sp)
@@ -755,7 +755,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; RV32IA-NEXT:    srl a4, a4, a0
 ; RV32IA-NEXT:    zext.b a4, a4
 ; RV32IA-NEXT:    sub a6, a4, a1
-; RV32IA-NEXT:    sltu a4, a4, a6
+; RV32IA-NEXT:    sltu a4, a1, a4
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    and a4, a4, a6
 ; RV32IA-NEXT:    sll a4, a4, a0
@@ -792,7 +792,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64I-NEXT:    zext.b a0, a3
 ; RV64I-NEXT:    sub a1, a0, s1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sltu a0, s1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    and a2, a0, a1
 ; RV64I-NEXT:    sb a3, 7(sp)
@@ -832,7 +832,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; RV64IA-NEXT:    sext.w a6, a3
 ; RV64IA-NEXT:    zext.b a5, a5
 ; RV64IA-NEXT:    sub a7, a5, a1
-; RV64IA-NEXT:    sltu a5, a5, a7
+; RV64IA-NEXT:    sltu a5, a1, a5
 ; RV64IA-NEXT:    addi a5, a5, -1
 ; RV64IA-NEXT:    and a5, a5, a7
 ; RV64IA-NEXT:    sllw a5, a5, a0
@@ -877,7 +877,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32I-NEXT:    and a0, a3, s1
 ; RV32I-NEXT:    sub a1, a0, s2
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sltu a0, s2, a0
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    and a2, a0, a1
 ; RV32I-NEXT:    sh a3, 14(sp)
@@ -920,7 +920,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV32IA-NEXT:    srl a5, a5, a0
 ; RV32IA-NEXT:    and a5, a5, a3
 ; RV32IA-NEXT:    sub a7, a5, a1
-; RV32IA-NEXT:    sltu a5, a5, a7
+; RV32IA-NEXT:    sltu a5, a1, a5
 ; RV32IA-NEXT:    addi a5, a5, -1
 ; RV32IA-NEXT:    and a5, a5, a7
 ; RV32IA-NEXT:    sll a5, a5, a0
@@ -961,7 +961,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64I-NEXT:    and a0, a3, s1
 ; RV64I-NEXT:    sub a1, a0, s2
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sltu a0, s2, a0
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    and a2, a0, a1
 ; RV64I-NEXT:    sh a3, 14(sp)
@@ -1004,7 +1004,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV64IA-NEXT:    sext.w a7, a4
 ; RV64IA-NEXT:    and a6, a6, a3
 ; RV64IA-NEXT:    sub t0, a6, a1
-; RV64IA-NEXT:    sltu a6, a6, t0
+; RV64IA-NEXT:    sltu a6, a1, a6
 ; RV64IA-NEXT:    addi a6, a6, -1
 ; RV64IA-NEXT:    and a6, a6, t0
 ; RV64IA-NEXT:    sllw a6, a6, a0
@@ -1044,7 +1044,7 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
 ; RV32I-NEXT:  .LBB6_1: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32I-NEXT:    sub a0, a3, s1
-; RV32I-NEXT:    sltu a1, a3, a0
+; RV32I-NEXT:    sltu a1, s1, a3
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    and a2, a1, a0
 ; RV32I-NEXT:    sw a3, 0(sp)
@@ -1075,7 +1075,7 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
 ; RV32IA-NEXT:    # Child Loop BB6_3 Depth 2
 ; RV32IA-NEXT:    mv a3, a2
 ; RV32IA-NEXT:    sub a2, a2, a1
-; RV32IA-NEXT:    sltu a4, a3, a2
+; RV32IA-NEXT:    sltu a4, a1, a3
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    and a4, a4, a2
 ; RV32IA-NEXT:  .LBB6_3: # %atomicrmw.start
@@ -1298,7 +1298,7 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; RV64I-NEXT:  .LBB7_1: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64I-NEXT:    sub a0, a3, s1
-; RV64I-NEXT:    sltu a1, a3, a0
+; RV64I-NEXT:    sltu a1, s1, a3
 ; RV64I-NEXT:    addi a1, a1, -1
 ; RV64I-NEXT:    and a2, a1, a0
 ; RV64I-NEXT:    sd a3, 0(sp)
@@ -1329,7 +1329,7 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; RV64IA-NEXT:    # Child Loop BB7_3 Depth 2
 ; RV64IA-NEXT:    mv a3, a2
 ; RV64IA-NEXT:    sub a2, a2, a1
-; RV64IA-NEXT:    sltu a4, a3, a2
+; RV64IA-NEXT:    sltu a4, a1, a3
 ; RV64IA-NEXT:    addi a4, a4, -1
 ; RV64IA-NEXT:    and a4, a4, a2
 ; RV64IA-NEXT:  .LBB7_3: # %atomicrmw.start
diff --git a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
index 5b215c5173211..0fb4b2a06b76f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
@@ -519,7 +519,7 @@ define <vscale x 16 x i64> @vp_abs_nxv16i64(<vscale x 16 x i64> %va, <vscale x 1
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -543,7 +543,7 @@ define <vscale x 16 x i64> @vp_abs_nxv16i64_unmasked(<vscale x 16 x i64> %va, i3
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 09b8fdbf11d26..025f944bcd51c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -3018,7 +3018,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
 ; CHECK-NEXT:    slli a3, a3, 2
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a4
 ; CHECK-NEXT:    sub a4, a0, a3
-; CHECK-NEXT:    sltu a5, a0, a4
+; CHECK-NEXT:    sltu a5, a3, a0
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    and a5, a5, a4
 ; CHECK-NEXT:    lui a6, 5
@@ -3079,7 +3079,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
 ; CHECK-ZVBB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVBB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVBB-NEXT:    and a2, a3, a2
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
@@ -3104,7 +3104,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16_unmasked(<vscale x 64 x i16>
 ; CHECK-NEXT:    lui a2, 3
 ; CHECK-NEXT:    slli a3, a3, 2
 ; CHECK-NEXT:    sub a4, a0, a3
-; CHECK-NEXT:    sltu a5, a0, a4
+; CHECK-NEXT:    sltu a5, a3, a0
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    and a5, a5, a4
 ; CHECK-NEXT:    lui a6, 5
@@ -3160,7 +3160,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16_unmasked(<vscale x 64 x i16>
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVBB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVBB-NEXT:    and a2, a3, a2
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index 0177b8cfd4393..668a770610f20 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -1534,7 +1534,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
 ; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
@@ -1561,7 +1561,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
 ; CHECK-ZVKB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVKB-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-ZVKB-NEXT:    sub a2, a0, a1
-; CHECK-ZVKB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVKB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVKB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVKB-NEXT:    and a2, a3, a2
 ; CHECK-ZVKB-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
@@ -1584,7 +1584,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16_unmasked(<vscale x 64 x i16> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
@@ -1606,7 +1606,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16_unmasked(<vscale x 64 x i16> %va,
 ; CHECK-ZVKB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVKB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVKB-NEXT:    sub a2, a0, a1
-; CHECK-ZVKB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVKB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVKB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVKB-NEXT:    and a2, a3, a2
 ; CHECK-ZVKB-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
index 6c7709f52e30b..d3813b703c5be 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v5, v6
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16_unmasked(<vscale x 32 x bflo
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v6, v7
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v5, v6
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v6, v7
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -1585,7 +1585,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
 ; RV32ZFH-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZFH-NEXT:    sub a2, a0, a1
 ; RV32ZFH-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZFH-NEXT:    sltu a3, a0, a2
+; RV32ZFH-NEXT:    sltu a3, a1, a0
 ; RV32ZFH-NEXT:    addi a3, a3, -1
 ; RV32ZFH-NEXT:    and a2, a3, a2
 ; RV32ZFH-NEXT:    vmv1r.v v0, v6
@@ -1631,7 +1631,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
 ; RV64ZFH-NEXT:    sub a3, a0, a1
 ; RV64ZFH-NEXT:    slli a2, a2, 52
 ; RV64ZFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZFH-NEXT:    sltu a2, a0, a3
+; RV64ZFH-NEXT:    sltu a2, a1, a0
 ; RV64ZFH-NEXT:    addi a2, a2, -1
 ; RV64ZFH-NEXT:    and a2, a2, a3
 ; RV64ZFH-NEXT:    vmv1r.v v0, v6
@@ -1676,7 +1676,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64_unmasked(<vscale x 16 x doubl
 ; RV32ZFH-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZFH-NEXT:    sub a3, a0, a1
 ; RV32ZFH-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZFH-NEXT:    sltu a2, a0, a3
+; RV32ZFH-NEXT:    sltu a2, a1, a0
 ; RV32ZFH-NEXT:    addi a2, a2, -1
 ; RV32ZFH-NEXT:    and a2, a2, a3
 ; RV32ZFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -1710,7 +1710,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64_unmasked(<vscale x 16 x doubl
 ; RV64ZFH-NEXT:    sub a3, a0, a1
 ; RV64ZFH-NEXT:    slli a2, a2, 52
 ; RV64ZFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZFH-NEXT:    sltu a2, a0, a3
+; RV64ZFH-NEXT:    sltu a2, a1, a0
 ; RV64ZFH-NEXT:    addi a2, a2, -1
 ; RV64ZFH-NEXT:    and a2, a2, a3
 ; RV64ZFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 20f397b694180..f8293f6c671f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -1195,7 +1195,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-NEXT:    srli a3, a1, 3
 ; CHECK-NEXT:    sub a5, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    sltu a3, a0, a5
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a5, a3, a5
 ; CHECK-NEXT:    li a3, 1086
@@ -1228,7 +1228,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    sub a3, a0, a1
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    sltu a2, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a2, a2, -1
 ; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -1252,7 +1252,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; CHECK-NEXT:    fsrmi a4, 1
 ; CHECK-NEXT:    li a2, 52
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a5, a0, a3
+; CHECK-NEXT:    sltu a5, a1, a0
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    and a5, a5, a3
 ; CHECK-NEXT:    li a3, 1086
@@ -1280,7 +1280,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVBB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVBB-NEXT:    and a2, a3, a2
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2402,7 +2402,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a4, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a4
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a4, a2, a4
 ; CHECK-NEXT:    li a2, 52
@@ -2433,7 +2433,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    sub a3, a0, a1
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    sltu a2, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a2, a2, -1
 ; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2456,7 +2456,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a2
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a4, a4, a2
 ; CHECK-NEXT:    li a2, 52
@@ -2482,7 +2482,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVBB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVBB-NEXT:    and a2, a3, a2
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index 1bbefc65d3e39..d16418f57033a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -1990,7 +1990,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    addi a2, a2, 1365
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    sltu a2, a0, a3
+; RV32-NEXT:    sltu a2, a1, a0
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2117,10 +2117,15 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    sub a6, a0, a1
+; RV64-NEXT:    sltu a1, a1, a0
+; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
 ; RV64-NEXT:    addi a4, a4, -241
 ; RV64-NEXT:    addi a5, a5, 257
+; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a1, a1, a6
 ; RV64-NEXT:    slli a6, a2, 32
 ; RV64-NEXT:    add a2, a2, a6
 ; RV64-NEXT:    slli a6, a3, 32
@@ -2129,11 +2134,6 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    add a4, a4, a6
 ; RV64-NEXT:    slli a6, a5, 32
 ; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a1
 ; RV64-NEXT:    vand.vx v24, v24, a2, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
@@ -2144,9 +2144,9 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV64-NEXT:    vand.vx v24, v24, a2, v0.t
 ; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
@@ -2158,7 +2158,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
 ; RV64-NEXT:    vmul.vx v16, v16, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v16, a6, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64:
@@ -2169,7 +2169,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    sub a3, a0, a1
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    sltu a2, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a2, a2, -1
 ; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2200,10 +2200,10 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV32-NEXT:    lui a3, 209715
 ; RV32-NEXT:    sub a4, a0, a1
 ; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v0, a2
-; RV32-NEXT:    sltu a2, a0, a4
+; RV32-NEXT:    sltu a2, a1, a0
+; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a2, a2, a4
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2308,10 +2308,15 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV64-NEXT:    lui a4, 209715
 ; RV64-NEXT:    lui a5, 61681
 ; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    sub a7, a0, a2
+; RV64-NEXT:    sltu a0, a2, a0
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    addi a3, a3, 1365
 ; RV64-NEXT:    addi a4, a4, 819
 ; RV64-NEXT:    addi a5, a5, -241
 ; RV64-NEXT:    addi a6, a6, 257
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a7
 ; RV64-NEXT:    slli a7, a3, 32
 ; RV64-NEXT:    add a3, a3, a7
 ; RV64-NEXT:    slli a7, a4, 32
@@ -2320,11 +2325,6 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV64-NEXT:    add a5, a5, a7
 ; RV64-NEXT:    slli a7, a6, 32
 ; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    li a7, 56
-; RV64-NEXT:    sub a2, a0, a2
-; RV64-NEXT:    sltu a0, a0, a2
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a2
 ; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -2346,26 +2346,26 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV64-NEXT:    vadd.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v8, v8, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v16, v24
+; RV64-NEXT:    vand.vx v16, v16, a5
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vmul.vx v8, v8, a6
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v16, a5
+; RV64-NEXT:    vmul.vx v16, v16, a6
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v8, v8, a7
+; RV64-NEXT:    vsrl.vx v8, v8, a2
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v16, v16, a6
-; RV64-NEXT:    vsrl.vx v16, v16, a7
+; RV64-NEXT:    vsrl.vx v16, v16, a2
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVBB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVBB-NEXT:    and a2, a3, a2
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
index c82ad17545a6a..464c4d1f5f899 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
@@ -2154,7 +2154,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    srli a2, a1, 3
 ; RV32-NEXT:    sub a3, a0, a1
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    sltu a2, a0, a3
+; RV32-NEXT:    sltu a2, a1, a0
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    lui a3, 349525
@@ -2190,31 +2190,31 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    lui a3, 61681
 ; RV32-NEXT:    addi a3, a3, -241
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    lui a3, 4112
 ; RV32-NEXT:    addi a3, a3, 257
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    vmv.v.x v16, a3
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v8, v16, a2, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    bltu a0, a1, .LBB46_2
@@ -2226,11 +2226,11 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vi v8, v16, -1, v0.t
+; RV32-NEXT:    vnot.v v16, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
@@ -2286,11 +2286,14 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    srli a6, a1, 3
 ; RV64-NEXT:    sub a7, a0, a1
+; RV64-NEXT:    vslidedown.vx v0, v0, a6
+; RV64-NEXT:    sltu a6, a1, a0
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
 ; RV64-NEXT:    addi a4, a4, -241
 ; RV64-NEXT:    addi t0, a5, 257
-; RV64-NEXT:    vslidedown.vx v0, v0, a6
+; RV64-NEXT:    addi a6, a6, -1
+; RV64-NEXT:    and a7, a6, a7
 ; RV64-NEXT:    slli a6, a2, 32
 ; RV64-NEXT:    add a6, a2, a6
 ; RV64-NEXT:    slli a5, a3, 32
@@ -2299,9 +2302,6 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    add a2, a4, a2
 ; RV64-NEXT:    slli a3, t0, 32
 ; RV64-NEXT:    add a3, t0, a3
-; RV64-NEXT:    sltu a4, a0, a7
-; RV64-NEXT:    addi a4, a4, -1
-; RV64-NEXT:    and a7, a4, a7
 ; RV64-NEXT:    li a4, 56
 ; RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vi v24, v16, -1, v0.t
@@ -2350,7 +2350,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    sub a3, a0, a1
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    sltu a2, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a2, a2, -1
 ; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2381,10 +2381,10 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT:    lui a3, 209715
 ; RV32-NEXT:    sub a4, a0, a1
 ; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v0, a2
-; RV32-NEXT:    sltu a2, a0, a4
+; RV32-NEXT:    sltu a2, a1, a0
+; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a2, a2, a4
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2489,21 +2489,21 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    sub a6, a0, a1
+; RV64-NEXT:    sltu a7, a1, a0
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
-; RV64-NEXT:    addi a7, a4, -241
-; RV64-NEXT:    addi t0, a5, 257
+; RV64-NEXT:    addi t0, a4, -241
+; RV64-NEXT:    addi t1, a5, 257
+; RV64-NEXT:    addi a7, a7, -1
+; RV64-NEXT:    and a6, a7, a6
 ; RV64-NEXT:    slli a5, a2, 32
 ; RV64-NEXT:    add a5, a2, a5
 ; RV64-NEXT:    slli a4, a3, 32
 ; RV64-NEXT:    add a4, a3, a4
-; RV64-NEXT:    slli a2, a7, 32
-; RV64-NEXT:    add a2, a7, a2
-; RV64-NEXT:    slli a3, t0, 32
-; RV64-NEXT:    add a3, t0, a3
-; RV64-NEXT:    sltu a7, a0, a6
-; RV64-NEXT:    addi a7, a7, -1
-; RV64-NEXT:    and a6, a7, a6
+; RV64-NEXT:    slli a2, t0, 32
+; RV64-NEXT:    add a2, t0, a2
+; RV64-NEXT:    slli a3, t1, 32
+; RV64-NEXT:    add a3, t1, a3
 ; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vi v24, v16, -1
 ; RV64-NEXT:    vnot.v v16, v16
@@ -2547,7 +2547,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVBB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVBB-NEXT:    and a2, a3, a2
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -3731,7 +3731,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a4, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a4
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a4, a2, a4
 ; CHECK-NEXT:    li a2, 52
@@ -3766,7 +3766,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    sub a3, a0, a1
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    sltu a2, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a2, a2, -1
 ; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -3789,7 +3789,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a2
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a4, a4, a2
 ; CHECK-NEXT:    li a2, 52
@@ -3819,7 +3819,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVBB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVBB-NEXT:    and a2, a3, a2
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
index fa81e1f6f3514..912a63b09f1a7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
@@ -392,10 +392,10 @@ define <32 x i64> @vp_abs_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v24, v8, 0, v0.t
 ; CHECK-NEXT:    vmax.vv v8, v8, v24, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v24, v16, 0, v0.t
@@ -417,10 +417,10 @@ define <32 x i64> @vp_abs_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v24, v8, 0
 ; CHECK-NEXT:    vmax.vv v8, v8, v24
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v24, v16, 0
 ; CHECK-NEXT:    vmax.vv v16, v16, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index f436bbb9a66ca..8e322b64ef551 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -2386,10 +2386,10 @@ define <128 x i16> @vp_bitreverse_v128i16(<128 x i16> %va, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; CHECK-NEXT:    lui a1, 1
 ; CHECK-NEXT:    lui a2, 3
-; CHECK-NEXT:    addi a3, a0, -64
-; CHECK-NEXT:    sltu a0, a0, a3
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a3, a0, a3
+; CHECK-NEXT:    sltiu a3, a0, 65
+; CHECK-NEXT:    addi a0, a0, -64
+; CHECK-NEXT:    neg a3, a3
+; CHECK-NEXT:    and a3, a3, a0
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
 ; CHECK-NEXT:    addi a1, a1, -241
@@ -2450,10 +2450,10 @@ define <128 x i16> @vp_bitreverse_v128i16_unmasked(<128 x i16> %va, i32 zeroext
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
 ; CHECK-NEXT:    lui a2, 1
 ; CHECK-NEXT:    lui a3, 3
-; CHECK-NEXT:    addi a4, a0, -64
-; CHECK-NEXT:    sltu a0, a0, a4
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a4
+; CHECK-NEXT:    sltiu a4, a0, 65
+; CHECK-NEXT:    addi a0, a0, -64
+; CHECK-NEXT:    neg a4, a4
+; CHECK-NEXT:    and a0, a4, a0
 ; CHECK-NEXT:    lui a4, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v24
 ; CHECK-NEXT:    addi a2, a2, -241
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
index eca94ccb9bf7f..c1c9e581decf8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
@@ -1275,10 +1275,10 @@ define <128 x i16> @vp_bswap_v128i16(<128 x i16> %va, <128 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v24, v0.t
-; CHECK-NEXT:    addi a1, a0, -64
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 65
+; CHECK-NEXT:    addi a0, a0, -64
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v16, 8, v0.t
@@ -1302,10 +1302,10 @@ define <128 x i16> @vp_bswap_v128i16_unmasked(<128 x i16> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vsrl.vi v24, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v24
-; CHECK-NEXT:    addi a1, a0, -64
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 65
+; CHECK-NEXT:    addi a0, a0, -64
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v16, 8
 ; CHECK-NEXT:    vsll.vi v16, v16, 8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
index 466d5d4b8e80a..b58de7abf0442 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFH-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT:    addi a1, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a1
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a1
+; RV32ZVFH-NEXT:    sltiu a1, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a1, a1
+; RV32ZVFH-NEXT:    and a0, a1, a0
 ; RV32ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFH-NEXT:    fsrmi a1, 3
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; RV64ZVFH-NEXT:    li a1, 1075
 ; RV64ZVFH-NEXT:    slli a1, a1, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFH-NEXT:    addi a1, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a1
-; RV64ZVFH-NEXT:    addi a0, a0, -1
+; RV64ZVFH-NEXT:    sltiu a1, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a1, a1
+; RV64ZVFH-NEXT:    and a0, a1, a0
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT:    and a0, a0, a1
 ; RV64ZVFH-NEXT:    fsrmi a1, 3
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFHMIN-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a1
+; RV32ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a1, a1
+; RV32ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFHMIN-NEXT:    fsrmi a1, 3
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; RV64ZVFHMIN-NEXT:    li a1, 1075
 ; RV64ZVFHMIN-NEXT:    slli a1, a1, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
+; RV64ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a1, a1
+; RV64ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT:    and a0, a0, a1
 ; RV64ZVFHMIN-NEXT:    fsrmi a1, 3
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT:    addi a2, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a2
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a2
+; RV32ZVFH-NEXT:    sltiu a2, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a2, a2
+; RV32ZVFH-NEXT:    and a0, a2, a0
 ; RV32ZVFH-NEXT:    fsrmi a2, 3
 ; RV32ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; RV64ZVFH-NEXT:    li a2, 1075
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    addi a2, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a2
-; RV64ZVFH-NEXT:    addi a0, a0, -1
-; RV64ZVFH-NEXT:    and a0, a0, a2
+; RV64ZVFH-NEXT:    sltiu a2, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a2, a2
+; RV64ZVFH-NEXT:    and a0, a2, a0
 ; RV64ZVFH-NEXT:    fsrmi a2, 3
 ; RV64ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a2
+; RV32ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a2, a2
+; RV32ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV32ZVFHMIN-NEXT:    fsrmi a2, 3
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; RV64ZVFHMIN-NEXT:    li a2, 1075
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV64ZVFHMIN-NEXT:    and a0, a0, a2
+; RV64ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a2, a2
+; RV64ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV64ZVFHMIN-NEXT:    fsrmi a2, 3
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index 00c36cb7f7327..d1fadc962c2eb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -1979,10 +1979,10 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    sltiu a3, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a0, a3, a0
 ; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
@@ -2065,22 +2065,22 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    sltiu a6, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
-; RV64-NEXT:    addi a6, a4, -241
-; RV64-NEXT:    addi a7, a5, 257
+; RV64-NEXT:    addi a7, a4, -241
+; RV64-NEXT:    addi t0, a5, 257
+; RV64-NEXT:    neg a4, a6
+; RV64-NEXT:    and a6, a4, a0
 ; RV64-NEXT:    slli a5, a2, 32
 ; RV64-NEXT:    add a5, a2, a5
 ; RV64-NEXT:    slli a4, a3, 32
 ; RV64-NEXT:    add a4, a3, a4
-; RV64-NEXT:    slli a2, a6, 32
-; RV64-NEXT:    add a2, a6, a2
-; RV64-NEXT:    slli a3, a7, 32
-; RV64-NEXT:    add a3, a7, a3
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a6, a0, a6
+; RV64-NEXT:    slli a2, a7, 32
+; RV64-NEXT:    add a2, a7, a2
+; RV64-NEXT:    slli a3, t0, 32
+; RV64-NEXT:    add a3, t0, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v8, 2, v0.t
@@ -2150,9 +2150,9 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    li a2, 32
@@ -2160,110 +2160,102 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a3, a3, 1365
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    sltiu a3, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a0, a3, a0
 ; RV32-NEXT:    lui a3, 209715
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    vsrl.vi v0, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 1
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vsrl.vi v0, v8, 4
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 2
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vsrl.vi v0, v8, 8
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 4
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v0, v8, a2
+; RV32-NEXT:    vsrl.vi v0, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 8
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vsrl.vx v0, v8, a2
+; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 16
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vx v0, v16, a2
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vsub.vv v0, v8, v0
+; RV32-NEXT:    vsub.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
-; RV32-NEXT:    vor.vv v24, v16, v8
+; RV32-NEXT:    vnot.v v0, v16
+; RV32-NEXT:    vsrl.vi v16, v0, 1
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    vmv.v.x v16, a3
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v0, 2
-; RV32-NEXT:    vand.vv v0, v0, v8
-; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v24, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v0, v0, v16
-; RV32-NEXT:    vsub.vv v24, v24, v0
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v24, v0, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v24, v8
+; RV32-NEXT:    vand.vv v0, v24, v16
 ; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v8, v24, v8
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    lui a2, 61681
 ; RV32-NEXT:    lui a3, 4112
 ; RV32-NEXT:    addi a2, a2, -241
 ; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v24
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v24, v8, v24
+; RV32-NEXT:    vmul.vv v16, v16, v24
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v24, a2
+; RV32-NEXT:    vsrl.vx v16, v16, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2285,95 +2277,95 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    lui a4, 209715
 ; RV64-NEXT:    lui a5, 61681
 ; RV64-NEXT:    lui a6, 4112
-; RV64-NEXT:    addi a7, a3, 1365
-; RV64-NEXT:    addi a3, a4, 819
-; RV64-NEXT:    addi a4, a5, -241
-; RV64-NEXT:    addi a6, a6, 257
-; RV64-NEXT:    slli a5, a7, 32
-; RV64-NEXT:    add a7, a7, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a5, a3, a5
-; RV64-NEXT:    slli a3, a4, 32
-; RV64-NEXT:    add a3, a4, a3
-; RV64-NEXT:    slli a4, a6, 32
-; RV64-NEXT:    add a4, a6, a4
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a6, a0, a6
-; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    sltiu a7, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
+; RV64-NEXT:    addi a3, a3, 1365
+; RV64-NEXT:    addi a4, a4, 819
+; RV64-NEXT:    addi a5, a5, -241
+; RV64-NEXT:    addi t0, a6, 257
+; RV64-NEXT:    neg a6, a7
+; RV64-NEXT:    and a0, a6, a0
+; RV64-NEXT:    slli a6, a3, 32
+; RV64-NEXT:    add a7, a3, a6
+; RV64-NEXT:    slli a6, a4, 32
+; RV64-NEXT:    add a6, a4, a6
+; RV64-NEXT:    slli a3, a5, 32
+; RV64-NEXT:    add a3, a5, a3
+; RV64-NEXT:    slli a4, t0, 32
+; RV64-NEXT:    add a4, t0, a4
+; RV64-NEXT:    li a5, 56
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 8
+; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 2
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 16
+; RV64-NEXT:    vsrl.vi v24, v8, 8
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v24, v8, a2
+; RV64-NEXT:    vsrl.vi v24, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 8
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v24, v8, a2
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vnot.v v8, v8
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsrl.vx v24, v16, a2
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
 ; RV64-NEXT:    vand.vx v24, v24, a7
 ; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v24, v16, a2
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v8, a5
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vnot.v v16, v16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v8, v8, a5
+; RV64-NEXT:    vand.vx v24, v8, a6
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a6
 ; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vand.vx v24, v24, a7
 ; RV64-NEXT:    vsub.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v16, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a6
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
+; RV64-NEXT:    vand.vx v16, v16, a6
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v16, a5
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v24, v16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vand.vx v16, v16, a3
 ; RV64-NEXT:    vmul.vx v16, v16, a4
-; RV64-NEXT:    vsrl.vx v16, v16, a0
+; RV64-NEXT:    vsrl.vx v16, v16, a5
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 false, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
@@ -4354,10 +4346,10 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    sltiu a3, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a0, a3, a0
 ; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
@@ -4440,22 +4432,22 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    sltiu a6, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
-; RV64-NEXT:    addi a6, a4, -241
-; RV64-NEXT:    addi a7, a5, 257
+; RV64-NEXT:    addi a7, a4, -241
+; RV64-NEXT:    addi t0, a5, 257
+; RV64-NEXT:    neg a4, a6
+; RV64-NEXT:    and a6, a4, a0
 ; RV64-NEXT:    slli a5, a2, 32
 ; RV64-NEXT:    add a5, a2, a5
 ; RV64-NEXT:    slli a4, a3, 32
 ; RV64-NEXT:    add a4, a3, a4
-; RV64-NEXT:    slli a2, a6, 32
-; RV64-NEXT:    add a2, a6, a2
-; RV64-NEXT:    slli a3, a7, 32
-; RV64-NEXT:    add a3, a7, a3
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a6, a0, a6
+; RV64-NEXT:    slli a2, a7, 32
+; RV64-NEXT:    add a2, a7, a2
+; RV64-NEXT:    slli a3, t0, 32
+; RV64-NEXT:    add a3, t0, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v8, 2, v0.t
@@ -4525,9 +4517,9 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    li a2, 32
@@ -4535,110 +4527,102 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    addi a3, a3, 1365
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    sltiu a3, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a0, a3, a0
 ; RV32-NEXT:    lui a3, 209715
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    vsrl.vi v0, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 1
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vsrl.vi v0, v8, 4
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 2
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vsrl.vi v0, v8, 8
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 4
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v0, v8, a2
+; RV32-NEXT:    vsrl.vi v0, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 8
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vsrl.vx v0, v8, a2
+; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 16
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vx v0, v16, a2
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vsub.vv v0, v8, v0
+; RV32-NEXT:    vsub.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
-; RV32-NEXT:    vor.vv v24, v16, v8
+; RV32-NEXT:    vnot.v v0, v16
+; RV32-NEXT:    vsrl.vi v16, v0, 1
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    vmv.v.x v16, a3
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v0, 2
-; RV32-NEXT:    vand.vv v0, v0, v8
-; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v24, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v0, v0, v16
-; RV32-NEXT:    vsub.vv v24, v24, v0
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v24, v0, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v24, v8
+; RV32-NEXT:    vand.vv v0, v24, v16
 ; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v8, v24, v8
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    lui a2, 61681
 ; RV32-NEXT:    lui a3, 4112
 ; RV32-NEXT:    addi a2, a2, -241
 ; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v24
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v24, v8, v24
+; RV32-NEXT:    vmul.vv v16, v16, v24
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v24, a2
+; RV32-NEXT:    vsrl.vx v16, v16, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -4660,95 +4644,95 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV64-NEXT:    lui a4, 209715
 ; RV64-NEXT:    lui a5, 61681
 ; RV64-NEXT:    lui a6, 4112
-; RV64-NEXT:    addi a7, a3, 1365
-; RV64-NEXT:    addi a3, a4, 819
-; RV64-NEXT:    addi a4, a5, -241
-; RV64-NEXT:    addi a6, a6, 257
-; RV64-NEXT:    slli a5, a7, 32
-; RV64-NEXT:    add a7, a7, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a5, a3, a5
-; RV64-NEXT:    slli a3, a4, 32
-; RV64-NEXT:    add a3, a4, a3
-; RV64-NEXT:    slli a4, a6, 32
-; RV64-NEXT:    add a4, a6, a4
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a6, a0, a6
-; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    sltiu a7, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
+; RV64-NEXT:    addi a3, a3, 1365
+; RV64-NEXT:    addi a4, a4, 819
+; RV64-NEXT:    addi a5, a5, -241
+; RV64-NEXT:    addi t0, a6, 257
+; RV64-NEXT:    neg a6, a7
+; RV64-NEXT:    and a0, a6, a0
+; RV64-NEXT:    slli a6, a3, 32
+; RV64-NEXT:    add a7, a3, a6
+; RV64-NEXT:    slli a6, a4, 32
+; RV64-NEXT:    add a6, a4, a6
+; RV64-NEXT:    slli a3, a5, 32
+; RV64-NEXT:    add a3, a5, a3
+; RV64-NEXT:    slli a4, t0, 32
+; RV64-NEXT:    add a4, t0, a4
+; RV64-NEXT:    li a5, 56
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 8
+; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 2
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 16
+; RV64-NEXT:    vsrl.vi v24, v8, 8
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v24, v8, a2
+; RV64-NEXT:    vsrl.vi v24, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 8
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v24, v8, a2
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vnot.v v8, v8
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsrl.vx v24, v16, a2
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
 ; RV64-NEXT:    vand.vx v24, v24, a7
 ; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v24, v16, a2
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v8, a5
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vnot.v v16, v16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v8, v8, a5
+; RV64-NEXT:    vand.vx v24, v8, a6
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a6
 ; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vand.vx v24, v24, a7
 ; RV64-NEXT:    vsub.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v16, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a6
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
+; RV64-NEXT:    vand.vx v16, v16, a6
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v16, a5
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v24, v16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vand.vx v16, v16, a3
 ; RV64-NEXT:    vmul.vx v16, v16, a4
-; RV64-NEXT:    vsrl.vx v16, v16, a0
+; RV64-NEXT:    vsrl.vx v16, v16, a5
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 true, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index f56438bf87e6a..61bc86333d95f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -1430,7 +1430,10 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi a2, a2, 819
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
@@ -1455,24 +1458,21 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi a2, a2, 257
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 40
@@ -1481,15 +1481,18 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1504,15 +1507,12 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -1541,10 +1541,14 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    lui a2, 209715
 ; RV64-NEXT:    lui a3, 61681
 ; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    sltiu a5, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a1, a1, 1365
 ; RV64-NEXT:    addi a2, a2, 819
 ; RV64-NEXT:    addi a3, a3, -241
 ; RV64-NEXT:    addi a4, a4, 257
+; RV64-NEXT:    neg a5, a5
+; RV64-NEXT:    and a0, a5, a0
 ; RV64-NEXT:    slli a5, a1, 32
 ; RV64-NEXT:    add a1, a1, a5
 ; RV64-NEXT:    slli a5, a2, 32
@@ -1553,10 +1557,6 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    add a3, a3, a5
 ; RV64-NEXT:    slli a5, a4, 32
 ; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    addi a5, a0, -16
-; RV64-NEXT:    sltu a0, a0, a5
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a5
 ; RV64-NEXT:    li a5, 56
 ; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
@@ -1603,10 +1603,10 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a2, a2, 1365
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v0, a2
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a2, a2, 819
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1628,13 +1628,13 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vand.vv v24, v16, v0
 ; RV32-NEXT:    vsrl.vi v16, v16, 2
 ; RV32-NEXT:    vand.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
 ; RV32-NEXT:    lui a2, 61681
 ; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v24, v16, 4
 ; RV32-NEXT:    vadd.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
@@ -1672,10 +1672,14 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    sltiu a6, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
 ; RV64-NEXT:    addi a4, a4, -241
 ; RV64-NEXT:    addi a5, a5, 257
+; RV64-NEXT:    neg a6, a6
+; RV64-NEXT:    and a0, a6, a0
 ; RV64-NEXT:    slli a6, a2, 32
 ; RV64-NEXT:    add a2, a2, a6
 ; RV64-NEXT:    slli a6, a3, 32
@@ -1684,10 +1688,6 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    add a4, a4, a6
 ; RV64-NEXT:    slli a6, a5, 32
 ; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a6
 ; RV64-NEXT:    li a6, 56
 ; RV64-NEXT:    vand.vx v24, v24, a2
 ; RV64-NEXT:    vsub.vv v8, v8, v24
@@ -1710,18 +1710,18 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vadd.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v8, v8, a4
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v16, v24
+; RV64-NEXT:    vand.vx v16, v16, a4
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vmul.vx v8, v8, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v16, a4
+; RV64-NEXT:    vmul.vx v16, v16, a5
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vx v8, v8, a6
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v16, v16, a5
 ; RV64-NEXT:    vsrl.vx v16, v16, a6
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64> %va, <32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index 098384d200045..0e3eadcce484e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -1604,10 +1604,10 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 40
@@ -1616,26 +1616,26 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v24, v8, -1, v0.t
+; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
@@ -1679,31 +1679,31 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    lui a2, 209715
 ; RV64-NEXT:    lui a3, 61681
 ; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    sltiu a5, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a1, a1, 1365
 ; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a5, a3, -241
+; RV64-NEXT:    addi a3, a3, -241
 ; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a3, a1, 32
-; RV64-NEXT:    add a6, a1, a3
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a3, a2, a3
-; RV64-NEXT:    slli a1, a5, 32
-; RV64-NEXT:    add a1, a5, a1
+; RV64-NEXT:    neg a5, a5
+; RV64-NEXT:    and a5, a5, a0
+; RV64-NEXT:    slli a0, a1, 32
+; RV64-NEXT:    add a6, a1, a0
+; RV64-NEXT:    slli a0, a2, 32
+; RV64-NEXT:    add a7, a2, a0
+; RV64-NEXT:    slli a1, a3, 32
+; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    slli a2, a4, 32
 ; RV64-NEXT:    add a2, a4, a2
-; RV64-NEXT:    addi a4, a0, -16
-; RV64-NEXT:    sltu a0, a0, a4
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a4, a0, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a7, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a7, v0.t
 ; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
@@ -1711,16 +1711,16 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vmul.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vi v24, v16, -1, v0.t
 ; RV64-NEXT:    vnot.v v16, v16, v0.t
 ; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
 ; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v16, a3, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a7, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a7, v0.t
 ; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
 ; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
@@ -1744,9 +1744,9 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vi v24, v8, -1
 ; RV32-NEXT:    vnot.v v0, v8
@@ -1754,15 +1754,10 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a2, a2, 1365
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v0, v24
@@ -1774,8 +1769,10 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vi v0, v16, -1
 ; RV32-NEXT:    vnot.v v16, v16
 ; RV32-NEXT:    vand.vv v0, v16, v0
+; RV32-NEXT:    vsrl.vi v16, v0, 1
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v0, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1783,16 +1780,9 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vsrl.vi v24, v24, 2
 ; RV32-NEXT:    vand.vv v24, v24, v16
 ; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v0
 ; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vv v24, v0, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v8, 4
@@ -1826,7 +1816,7 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v16, v16, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1848,10 +1838,14 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    sltiu a6, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
 ; RV64-NEXT:    addi a4, a4, -241
 ; RV64-NEXT:    addi a5, a5, 257
+; RV64-NEXT:    neg a6, a6
+; RV64-NEXT:    and a0, a6, a0
 ; RV64-NEXT:    slli a6, a2, 32
 ; RV64-NEXT:    add a2, a2, a6
 ; RV64-NEXT:    slli a6, a3, 32
@@ -1860,47 +1854,43 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    add a4, a4, a6
 ; RV64-NEXT:    slli a6, a5, 32
 ; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a6
 ; RV64-NEXT:    li a6, 56
 ; RV64-NEXT:    vand.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vi v24, v16, -1
 ; RV64-NEXT:    vnot.v v16, v16
 ; RV64-NEXT:    vand.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vand.vx v24, v24, a2
 ; RV64-NEXT:    vsub.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a3
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vadd.vv v8, v24, v8
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v24, v16, a3
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
 ; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vadd.vv v16, v24, v16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v8, v8, a4
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vmul.vx v8, v8, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vx v8, v8, a6
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vand.vx v16, v16, a4
 ; RV64-NEXT:    vmul.vx v16, v16, a5
 ; RV64-NEXT:    vsrl.vx v16, v16, a6
@@ -3509,10 +3499,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 40
@@ -3521,26 +3511,26 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v24, v8, -1, v0.t
+; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
@@ -3584,31 +3574,31 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    lui a2, 209715
 ; RV64-NEXT:    lui a3, 61681
 ; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    sltiu a5, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a1, a1, 1365
 ; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a5, a3, -241
+; RV64-NEXT:    addi a3, a3, -241
 ; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a3, a1, 32
-; RV64-NEXT:    add a6, a1, a3
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a3, a2, a3
-; RV64-NEXT:    slli a1, a5, 32
-; RV64-NEXT:    add a1, a5, a1
+; RV64-NEXT:    neg a5, a5
+; RV64-NEXT:    and a5, a5, a0
+; RV64-NEXT:    slli a0, a1, 32
+; RV64-NEXT:    add a6, a1, a0
+; RV64-NEXT:    slli a0, a2, 32
+; RV64-NEXT:    add a7, a2, a0
+; RV64-NEXT:    slli a1, a3, 32
+; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    slli a2, a4, 32
 ; RV64-NEXT:    add a2, a4, a2
-; RV64-NEXT:    addi a4, a0, -16
-; RV64-NEXT:    sltu a0, a0, a4
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a4, a0, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a7, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a7, v0.t
 ; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
@@ -3616,16 +3606,16 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    vmul.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vi v24, v16, -1, v0.t
 ; RV64-NEXT:    vnot.v v16, v16, v0.t
 ; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
 ; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v16, a3, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a7, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a7, v0.t
 ; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
 ; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
@@ -3649,9 +3639,9 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vi v24, v8, -1
 ; RV32-NEXT:    vnot.v v0, v8
@@ -3659,15 +3649,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    addi a2, a2, 1365
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v0, v24
@@ -3679,8 +3664,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vadd.vi v0, v16, -1
 ; RV32-NEXT:    vnot.v v16, v16
 ; RV32-NEXT:    vand.vv v0, v16, v0
+; RV32-NEXT:    vsrl.vi v16, v0, 1
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v0, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -3688,16 +3675,9 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vsrl.vi v24, v24, 2
 ; RV32-NEXT:    vand.vv v24, v24, v16
 ; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v0
 ; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vv v24, v0, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v8, 4
@@ -3731,7 +3711,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v16, v16, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3753,10 +3733,14 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    sltiu a6, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
 ; RV64-NEXT:    addi a4, a4, -241
 ; RV64-NEXT:    addi a5, a5, 257
+; RV64-NEXT:    neg a6, a6
+; RV64-NEXT:    and a0, a6, a0
 ; RV64-NEXT:    slli a6, a2, 32
 ; RV64-NEXT:    add a2, a2, a6
 ; RV64-NEXT:    slli a6, a3, 32
@@ -3765,47 +3749,43 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV64-NEXT:    add a4, a4, a6
 ; RV64-NEXT:    slli a6, a5, 32
 ; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a6
 ; RV64-NEXT:    li a6, 56
 ; RV64-NEXT:    vand.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vi v24, v16, -1
 ; RV64-NEXT:    vnot.v v16, v16
 ; RV64-NEXT:    vand.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vand.vx v24, v24, a2
 ; RV64-NEXT:    vsub.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a3
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vadd.vv v8, v24, v8
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v24, v16, a3
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
 ; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vadd.vv v16, v24, v16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v8, v8, a4
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vmul.vx v8, v8, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vx v8, v8, a6
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vand.vx v16, v16, a4
 ; RV64-NEXT:    vmul.vx v16, v16, a5
 ; RV64-NEXT:    vsrl.vx v16, v16, a6
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
index 76f5f0a32bd1c..5a0749068b41d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFH-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT:    addi a1, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a1
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a1
+; RV32ZVFH-NEXT:    sltiu a1, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a1, a1
+; RV32ZVFH-NEXT:    and a0, a1, a0
 ; RV32ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFH-NEXT:    fsrmi a1, 2
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV64ZVFH-NEXT:    li a1, 1075
 ; RV64ZVFH-NEXT:    slli a1, a1, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFH-NEXT:    addi a1, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a1
-; RV64ZVFH-NEXT:    addi a0, a0, -1
+; RV64ZVFH-NEXT:    sltiu a1, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a1, a1
+; RV64ZVFH-NEXT:    and a0, a1, a0
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT:    and a0, a0, a1
 ; RV64ZVFH-NEXT:    fsrmi a1, 2
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFHMIN-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a1
+; RV32ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a1, a1
+; RV32ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFHMIN-NEXT:    fsrmi a1, 2
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV64ZVFHMIN-NEXT:    li a1, 1075
 ; RV64ZVFHMIN-NEXT:    slli a1, a1, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
+; RV64ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a1, a1
+; RV64ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT:    and a0, a0, a1
 ; RV64ZVFHMIN-NEXT:    fsrmi a1, 2
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT:    addi a2, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a2
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a2
+; RV32ZVFH-NEXT:    sltiu a2, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a2, a2
+; RV32ZVFH-NEXT:    and a0, a2, a0
 ; RV32ZVFH-NEXT:    fsrmi a2, 2
 ; RV32ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV64ZVFH-NEXT:    li a2, 1075
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    addi a2, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a2
-; RV64ZVFH-NEXT:    addi a0, a0, -1
-; RV64ZVFH-NEXT:    and a0, a0, a2
+; RV64ZVFH-NEXT:    sltiu a2, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a2, a2
+; RV64ZVFH-NEXT:    and a0, a2, a0
 ; RV64ZVFH-NEXT:    fsrmi a2, 2
 ; RV64ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a2
+; RV32ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a2, a2
+; RV32ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV32ZVFHMIN-NEXT:    fsrmi a2, 2
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV64ZVFHMIN-NEXT:    li a2, 1075
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV64ZVFHMIN-NEXT:    and a0, a0, a2
+; RV64ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a2, a2
+; RV64ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV64ZVFHMIN-NEXT:    fsrmi a2, 2
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
index da6e2fae93687..ad7ee735707f7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
@@ -611,10 +611,10 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a2, 24
@@ -657,75 +657,6 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 }
 
 define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %vb, i32 zeroext %evl) {
-; CHECK-LABEL: vfmax_vv_v32f64_unmasked:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a2
-; CHECK-NEXT:    bltu a2, a1, .LBB25_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:  .LBB25_2:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v8, v8
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
-; CHECK-NEXT:    vfmax.vv v16, v16, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.maximum.v32f64(<32 x double> %va, <32 x double> %vb, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x double> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
index e179970199171..9a5304e0d94e6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
@@ -611,10 +611,10 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a2, 24
@@ -657,75 +657,6 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 }
 
 define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %vb, i32 zeroext %evl) {
-; CHECK-LABEL: vfmin_vv_v32f64_unmasked:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a2
-; CHECK-NEXT:    bltu a2, a1, .LBB25_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:  .LBB25_2:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v8, v8
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
-; CHECK-NEXT:    vfmin.vv v16, v16, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.minimum.v32f64(<32 x double> %va, <32 x double> %vb, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x double> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
index 465b166826a37..6d87ecfd3bc6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
@@ -96,10 +96,10 @@ define <32 x double> @vfpext_v32f32_v32f64(<32 x float> %a, <32 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB7_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 16
 ; CHECK-NEXT:    vmv1r.v v0, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
index 96eda109e1c70..044b9fefa1220 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
@@ -376,10 +376,10 @@ define <32 x i64> @vfptosi_v32i64_v32f64(<32 x double> %va, <32 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB25_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v16, v16, v0.t
@@ -399,10 +399,10 @@ define <32 x i64> @vfptosi_v32i64_v32f64_unmasked(<32 x double> %va, i32 zeroext
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v16, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
index 4020100bf364b..55f4d9e0805c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
@@ -376,10 +376,10 @@ define <32 x i64> @vfptoui_v32i64_v32f64(<32 x double> %va, <32 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB25_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.xu.f.v v8, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.xu.f.v v16, v16, v0.t
@@ -399,10 +399,10 @@ define <32 x i64> @vfptoui_v32i64_v32f64_unmasked(<32 x double> %va, i32 zeroext
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.xu.f.v v8, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.xu.f.v v16, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
index e509722b623a2..aab5bbdfebacd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
@@ -97,10 +97,10 @@ define <32 x float> @vfptrunc_v32f32_v32f64(<32 x double> %a, <32 x i1> %m, i32
 ; CHECK-NEXT:  .LBB7_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vfncvt.f.f.w v8, v24, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfncvt.f.f.w v24, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
index 57c94830fc606..e3ed908a5bddb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -741,10 +741,10 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32-NEXT:    addi a1, a0, -16
-; RV32-NEXT:    sltu a0, a0, a1
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    sltiu a1, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a1, a1
+; RV32-NEXT:    and a0, a1, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32-NEXT:    frflags a1
@@ -787,12 +787,12 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    li a1, 1075
 ; RV64-NEXT:    slli a1, a1, 52
 ; RV64-NEXT:    fmv.d.x fa5, a1
-; RV64-NEXT:    addi a1, a0, -16
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    sltiu a1, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
+; RV64-NEXT:    neg a1, a1
+; RV64-NEXT:    and a0, a1, a0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64-NEXT:    and a0, a0, a1
 ; RV64-NEXT:    frflags a1
 ; RV64-NEXT:    vmv1r.v v0, v6
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -832,10 +832,10 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; RV32-NEXT:    vfabs.v v24, v8
 ; RV32-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    frflags a2
 ; RV32-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -870,10 +870,10 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; RV64-NEXT:    li a2, 1075
 ; RV64-NEXT:    slli a2, a2, 52
 ; RV64-NEXT:    fmv.d.x fa5, a2
-; RV64-NEXT:    addi a2, a0, -16
-; RV64-NEXT:    sltu a0, a0, a2
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a2
+; RV64-NEXT:    sltiu a2, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
+; RV64-NEXT:    neg a2, a2
+; RV64-NEXT:    and a0, a2, a0
 ; RV64-NEXT:    frflags a2
 ; RV64-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
index ca9b24e60e503..4e90727b6ebf1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
@@ -123,10 +123,10 @@ define float @vpreduce_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32
 ; CHECK-NEXT:    vfmv.s.f v25, fa0
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vfredusum.vs v25, v8, v25, v0.t
-; CHECK-NEXT:    addi a1, a0, -32
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 33
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -32
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfredusum.vs v25, v16, v25, v0.t
@@ -151,10 +151,10 @@ define float @vpreduce_ord_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m,
 ; CHECK-NEXT:    vfmv.s.f v25, fa0
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vfredosum.vs v25, v8, v25, v0.t
-; CHECK-NEXT:    addi a1, a0, -32
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 33
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -32
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfredosum.vs v25, v16, v25, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
index 3e77020ed0213..27211f153b526 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
@@ -654,12 +654,12 @@ define signext i32 @vpreduce_xor_v64i32(i32 signext %s, <64 x i32> %v, <64 x i1>
 ; CHECK-NEXT:  .LBB49_2:
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v25, a0
-; CHECK-NEXT:    addi a0, a1, -32
+; CHECK-NEXT:    sltiu a0, a1, 33
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vredxor.vs v25, v8, v25, v0.t
-; CHECK-NEXT:    sltu a1, a1, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a1, a1, -32
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vredxor.vs v25, v16, v25, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
index 8523ca957a8f5..b5cd2e783ff66 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
@@ -211,15 +211,15 @@ define zeroext i1 @vpreduce_and_v256i1(i1 zeroext %s, <256 x i1> %v, <256 x i1>
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:  .LBB14_2:
 ; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    sltiu a3, a1, 129
+; CHECK-NEXT:    addi a1, a1, -128
+; CHECK-NEXT:    neg a3, a3
+; CHECK-NEXT:    and a1, a3, a1
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vmnot.m v9, v9
 ; CHECK-NEXT:    vcpop.m a2, v9, v0.t
 ; CHECK-NEXT:    seqz a2, a2
 ; CHECK-NEXT:    and a0, a2, a0
-; CHECK-NEXT:    addi a2, a1, -128
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmv1r.v v0, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
index 7540495c0d3b5..41e8d1f982e32 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
@@ -669,12 +669,12 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; RV32-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32-NEXT:    addi a1, a0, -16
-; RV32-NEXT:    sltu a0, a0, a1
-; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    sltiu a1, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a1, a1
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    and a0, a1, a0
 ; RV32-NEXT:    vmv1r.v v0, v6
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -711,12 +711,12 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; RV64-NEXT:    li a1, 1075
 ; RV64-NEXT:    slli a1, a1, 52
 ; RV64-NEXT:    fmv.d.x fa5, a1
-; RV64-NEXT:    addi a1, a0, -16
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    sltiu a1, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
+; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    and a0, a1, a0
 ; RV64-NEXT:    vmv1r.v v0, v6
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -752,10 +752,10 @@ define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; RV32-NEXT:    vfabs.v v24, v8
 ; RV32-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vfabs.v v24, v16
@@ -786,11 +786,11 @@ define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; RV64-NEXT:    li a2, 1075
 ; RV64-NEXT:    slli a2, a2, 52
 ; RV64-NEXT:    fmv.d.x fa5, a2
-; RV64-NEXT:    addi a2, a0, -16
-; RV64-NEXT:    sltu a0, a0, a2
-; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    sltiu a2, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
+; RV64-NEXT:    neg a2, a2
+; RV64-NEXT:    and a0, a2, a0
 ; RV64-NEXT:    vmflt.vf v0, v24, fa5
-; RV64-NEXT:    and a0, a0, a2
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vfabs.v v24, v16
 ; RV64-NEXT:    vmflt.vf v7, v24, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
index de5427f329496..2d4941744292e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFH-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT:    addi a1, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a1
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a1
+; RV32ZVFH-NEXT:    sltiu a1, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a1, a1
+; RV32ZVFH-NEXT:    and a0, a1, a0
 ; RV32ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFH-NEXT:    fsrmi a1, 4
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV64ZVFH-NEXT:    li a1, 1075
 ; RV64ZVFH-NEXT:    slli a1, a1, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFH-NEXT:    addi a1, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a1
-; RV64ZVFH-NEXT:    addi a0, a0, -1
+; RV64ZVFH-NEXT:    sltiu a1, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a1, a1
+; RV64ZVFH-NEXT:    and a0, a1, a0
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT:    and a0, a0, a1
 ; RV64ZVFH-NEXT:    fsrmi a1, 4
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFHMIN-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a1
+; RV32ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a1, a1
+; RV32ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFHMIN-NEXT:    fsrmi a1, 4
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV64ZVFHMIN-NEXT:    li a1, 1075
 ; RV64ZVFHMIN-NEXT:    slli a1, a1, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
+; RV64ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a1, a1
+; RV64ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT:    and a0, a0, a1
 ; RV64ZVFHMIN-NEXT:    fsrmi a1, 4
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT:    addi a2, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a2
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a2
+; RV32ZVFH-NEXT:    sltiu a2, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a2, a2
+; RV32ZVFH-NEXT:    and a0, a2, a0
 ; RV32ZVFH-NEXT:    fsrmi a2, 4
 ; RV32ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV64ZVFH-NEXT:    li a2, 1075
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    addi a2, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a2
-; RV64ZVFH-NEXT:    addi a0, a0, -1
-; RV64ZVFH-NEXT:    and a0, a0, a2
+; RV64ZVFH-NEXT:    sltiu a2, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a2, a2
+; RV64ZVFH-NEXT:    and a0, a2, a0
 ; RV64ZVFH-NEXT:    fsrmi a2, 4
 ; RV64ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a2
+; RV32ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a2, a2
+; RV32ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV32ZVFHMIN-NEXT:    fsrmi a2, 4
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV64ZVFHMIN-NEXT:    li a2, 1075
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV64ZVFHMIN-NEXT:    and a0, a0, a2
+; RV64ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a2, a2
+; RV64ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV64ZVFHMIN-NEXT:    fsrmi a2, 4
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
index 1c923e3f12171..45ea933f427ae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFH-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT:    addi a1, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a1
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a1
+; RV32ZVFH-NEXT:    sltiu a1, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a1, a1
+; RV32ZVFH-NEXT:    and a0, a1, a0
 ; RV32ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFH-NEXT:    fsrmi a1, 0
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; RV64ZVFH-NEXT:    li a1, 1075
 ; RV64ZVFH-NEXT:    slli a1, a1, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFH-NEXT:    addi a1, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a1
-; RV64ZVFH-NEXT:    addi a0, a0, -1
+; RV64ZVFH-NEXT:    sltiu a1, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a1, a1
+; RV64ZVFH-NEXT:    and a0, a1, a0
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT:    and a0, a0, a1
 ; RV64ZVFH-NEXT:    fsrmi a1, 0
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFHMIN-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a1
+; RV32ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a1, a1
+; RV32ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFHMIN-NEXT:    fsrmi a1, 0
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; RV64ZVFHMIN-NEXT:    li a1, 1075
 ; RV64ZVFHMIN-NEXT:    slli a1, a1, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
+; RV64ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a1, a1
+; RV64ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT:    and a0, a0, a1
 ; RV64ZVFHMIN-NEXT:    fsrmi a1, 0
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT:    addi a2, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a2
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a2
+; RV32ZVFH-NEXT:    sltiu a2, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a2, a2
+; RV32ZVFH-NEXT:    and a0, a2, a0
 ; RV32ZVFH-NEXT:    fsrmi a2, 0
 ; RV32ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; RV64ZVFH-NEXT:    li a2, 1075
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    addi a2, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a2
-; RV64ZVFH-NEXT:    addi a0, a0, -1
-; RV64ZVFH-NEXT:    and a0, a0, a2
+; RV64ZVFH-NEXT:    sltiu a2, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a2, a2
+; RV64ZVFH-NEXT:    and a0, a2, a0
 ; RV64ZVFH-NEXT:    fsrmi a2, 0
 ; RV64ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a2
+; RV32ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a2, a2
+; RV32ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV32ZVFHMIN-NEXT:    fsrmi a2, 0
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; RV64ZVFHMIN-NEXT:    li a2, 1075
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV64ZVFHMIN-NEXT:    and a0, a0, a2
+; RV64ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a2, a2
+; RV64ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV64ZVFHMIN-NEXT:    fsrmi a2, 0
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
index 83cbd2b760341..3dc45f97e6964 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFH-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT:    addi a1, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a1
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a1
+; RV32ZVFH-NEXT:    sltiu a1, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a1, a1
+; RV32ZVFH-NEXT:    and a0, a1, a0
 ; RV32ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFH-NEXT:    fsrmi a1, 1
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
 ; RV64ZVFH-NEXT:    li a1, 1075
 ; RV64ZVFH-NEXT:    slli a1, a1, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFH-NEXT:    addi a1, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a1
-; RV64ZVFH-NEXT:    addi a0, a0, -1
+; RV64ZVFH-NEXT:    sltiu a1, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a1, a1
+; RV64ZVFH-NEXT:    and a0, a1, a0
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT:    and a0, a0, a1
 ; RV64ZVFH-NEXT:    fsrmi a1, 1
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFHMIN-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a1
+; RV32ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a1, a1
+; RV32ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFHMIN-NEXT:    fsrmi a1, 1
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
 ; RV64ZVFHMIN-NEXT:    li a1, 1075
 ; RV64ZVFHMIN-NEXT:    slli a1, a1, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
+; RV64ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a1, a1
+; RV64ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT:    and a0, a0, a1
 ; RV64ZVFHMIN-NEXT:    fsrmi a1, 1
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT:    addi a2, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a2
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a2
+; RV32ZVFH-NEXT:    sltiu a2, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a2, a2
+; RV32ZVFH-NEXT:    and a0, a2, a0
 ; RV32ZVFH-NEXT:    fsrmi a2, 1
 ; RV32ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
 ; RV64ZVFH-NEXT:    li a2, 1075
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    addi a2, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a2
-; RV64ZVFH-NEXT:    addi a0, a0, -1
-; RV64ZVFH-NEXT:    and a0, a0, a2
+; RV64ZVFH-NEXT:    sltiu a2, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a2, a2
+; RV64ZVFH-NEXT:    and a0, a2, a0
 ; RV64ZVFH-NEXT:    fsrmi a2, 1
 ; RV64ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a2
+; RV32ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a2, a2
+; RV32ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV32ZVFHMIN-NEXT:    fsrmi a2, 1
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
 ; RV64ZVFHMIN-NEXT:    li a2, 1075
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV64ZVFHMIN-NEXT:    and a0, a0, a2
+; RV64ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a2, a2
+; RV64ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV64ZVFHMIN-NEXT:    fsrmi a2, 1
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index af3e9db9fe123..79f1b88a765b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1076,10 +1076,10 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFH-NEXT:  .LBB43_2:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v6, v8, v24, v0.t
-; ZVFH-NEXT:    addi a0, a2, -64
-; ZVFH-NEXT:    sltu a1, a2, a0
-; ZVFH-NEXT:    addi a1, a1, -1
-; ZVFH-NEXT:    and a0, a1, a0
+; ZVFH-NEXT:    sltiu a0, a2, 65
+; ZVFH-NEXT:    neg a0, a0
+; ZVFH-NEXT:    addi a1, a2, -64
+; ZVFH-NEXT:    and a0, a0, a1
 ; ZVFH-NEXT:    vmv1r.v v0, v7
 ; ZVFH-NEXT:    addi a1, sp, 16
 ; ZVFH-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -3929,10 +3929,10 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x
 ; CHECK-NEXT:  .LBB87_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v6, v8, v24, v0.t
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
index efc0f7ef4a441..9f354d160d7c1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
@@ -598,13 +598,13 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1>
 ; CHECK-NEXT:    addi a4, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a2)
-; CHECK-NEXT:    addi a2, a3, -128
+; CHECK-NEXT:    sltiu a2, a3, 129
 ; CHECK-NEXT:    vle8.v v24, (a4)
-; CHECK-NEXT:    sltu a4, a3, a2
+; CHECK-NEXT:    addi a4, a3, -128
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a2, a4, a2
-; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    neg a0, a2
+; CHECK-NEXT:    and a0, a0, a4
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vv v6, v16, v24, v0.t
 ; CHECK-NEXT:    bltu a3, a1, .LBB51_2
 ; CHECK-NEXT:  # %bb.1:
@@ -636,10 +636,10 @@ define <256 x i1> @icmp_eq_vx_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 z
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltiu a1, a2, 129
+; CHECK-NEXT:    addi a4, a2, -128
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB52_2
@@ -666,10 +666,10 @@ define <256 x i1> @icmp_eq_vx_swap_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m,
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltiu a1, a2, 129
+; CHECK-NEXT:    addi a4, a2, -128
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB53_2
@@ -1250,10 +1250,10 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m
 ; CHECK-NEXT:  .LBB99_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vv v6, v8, v24, v0.t
-; CHECK-NEXT:    addi a0, a2, -32
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 33
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -32
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -1286,10 +1286,10 @@ define <64 x i1> @icmp_eq_vx_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB100_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v25, v8, a0, v0.t
-; CHECK-NEXT:    addi a2, a1, -32
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    sltiu a2, a1, 33
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    addi a1, a1, -32
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v8, v16, a0, v0.t
@@ -1316,10 +1316,10 @@ define <64 x i1> @icmp_eq_vx_swap_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i
 ; CHECK-NEXT:  .LBB101_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v25, v8, a0, v0.t
-; CHECK-NEXT:    addi a2, a1, -32
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    sltiu a2, a1, 33
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    addi a1, a1, -32
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v8, v16, a0, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
index a452e5a9ffbb8..9a08596ebb473 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
@@ -147,10 +147,10 @@ define <32 x i64> @vsext_v32i64_v32i32(<32 x i32> %va, <32 x i1> %m, i32 zeroext
 ; CHECK-NEXT:  .LBB12_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vsext.vf2 v16, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 16
 ; CHECK-NEXT:    vmv1r.v v0, v24
@@ -174,10 +174,10 @@ define <32 x i64> @vsext_v32i64_v32i32_unmasked(<32 x i32> %va, i32 zeroext %evl
 ; CHECK-NEXT:  .LBB13_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vsext.vf2 v24, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 16
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
index afa8f2fda2ed4..8202ba4e2d815 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
@@ -372,10 +372,10 @@ define <32 x double> @vsitofp_v32f64_v32i64(<32 x i64> %va, <32 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB25_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -395,10 +395,10 @@ define <32 x double> @vsitofp_v32f64_v32i64_unmasked(<32 x i64> %va, i32 zeroext
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 8af4ced77be39..45c106240efc9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -487,25 +487,24 @@ define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    li a4, 16
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    bltu a2, a4, .LBB45_2
+; CHECK-NEXT:    sltiu a3, a2, 17
+; CHECK-NEXT:    addi a4, a2, -16
+; CHECK-NEXT:    neg a3, a3
+; CHECK-NEXT:    li a5, 16
+; CHECK-NEXT:    and a3, a3, a4
+; CHECK-NEXT:    bltu a2, a5, .LBB45_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 16
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:  .LBB45_2:
-; CHECK-NEXT:    mul a4, a3, a1
-; CHECK-NEXT:    addi a5, a2, -16
+; CHECK-NEXT:    mul a4, a2, a1
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 2
 ; CHECK-NEXT:    add a4, a0, a4
-; CHECK-NEXT:    sltu a2, a2, a5
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a5
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vlse64.v v16, (a4), a1, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
 ; CHECK-NEXT:    ret
   %load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr %ptr, i32 %stride, <32 x i1> %m, i32 %evl)
@@ -515,21 +514,20 @@ define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x
 define <32 x double> @strided_vpload_v32f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) nounwind {
 ; CHECK-LABEL: strided_vpload_v32f64_allones_mask:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a4, 16
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    bltu a2, a4, .LBB46_2
+; CHECK-NEXT:    sltiu a3, a2, 17
+; CHECK-NEXT:    addi a4, a2, -16
+; CHECK-NEXT:    neg a3, a3
+; CHECK-NEXT:    li a5, 16
+; CHECK-NEXT:    and a3, a3, a4
+; CHECK-NEXT:    bltu a2, a5, .LBB46_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 16
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:  .LBB46_2:
-; CHECK-NEXT:    mul a4, a3, a1
-; CHECK-NEXT:    addi a5, a2, -16
+; CHECK-NEXT:    mul a4, a2, a1
 ; CHECK-NEXT:    add a4, a0, a4
-; CHECK-NEXT:    sltu a2, a2, a5
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a5
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vlse64.v v16, (a4), a1
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT:    vlse64.v v16, (a4), a1
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vlse64.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr %ptr, i32 %stride, <32 x i1> splat (i1 true), i32 %evl)
@@ -549,10 +547,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV32-NEXT:    li a3, 32
 ; CHECK-RV32-NEXT:  .LBB47_2:
 ; CHECK-RV32-NEXT:    mul a6, a3, a2
-; CHECK-RV32-NEXT:    addi a5, a4, -32
-; CHECK-RV32-NEXT:    sltu a7, a4, a5
-; CHECK-RV32-NEXT:    addi a7, a7, -1
-; CHECK-RV32-NEXT:    and a7, a7, a5
+; CHECK-RV32-NEXT:    sltiu a5, a4, 33
+; CHECK-RV32-NEXT:    addi a7, a4, -32
+; CHECK-RV32-NEXT:    neg a5, a5
+; CHECK-RV32-NEXT:    and a7, a5, a7
 ; CHECK-RV32-NEXT:    li a5, 16
 ; CHECK-RV32-NEXT:    add a6, a1, a6
 ; CHECK-RV32-NEXT:    bltu a7, a5, .LBB47_4
@@ -563,10 +561,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV32-NEXT:    vslidedown.vi v0, v8, 4
 ; CHECK-RV32-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v16, (a6), a2, v0.t
-; CHECK-RV32-NEXT:    addi a6, a3, -16
-; CHECK-RV32-NEXT:    sltu a3, a3, a6
-; CHECK-RV32-NEXT:    addi a3, a3, -1
-; CHECK-RV32-NEXT:    and a3, a3, a6
+; CHECK-RV32-NEXT:    sltiu a6, a3, 17
+; CHECK-RV32-NEXT:    neg a6, a6
+; CHECK-RV32-NEXT:    addi a3, a3, -16
+; CHECK-RV32-NEXT:    and a3, a6, a3
 ; CHECK-RV32-NEXT:    bltu a4, a5, .LBB47_6
 ; CHECK-RV32-NEXT:  # %bb.5:
 ; CHECK-RV32-NEXT:    li a4, 16
@@ -600,10 +598,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV64-NEXT:    li a4, 32
 ; CHECK-RV64-NEXT:  .LBB47_2:
 ; CHECK-RV64-NEXT:    mul a6, a4, a2
-; CHECK-RV64-NEXT:    addi a5, a3, -32
-; CHECK-RV64-NEXT:    sltu a7, a3, a5
-; CHECK-RV64-NEXT:    addi a7, a7, -1
-; CHECK-RV64-NEXT:    and a7, a7, a5
+; CHECK-RV64-NEXT:    sltiu a5, a3, 33
+; CHECK-RV64-NEXT:    addi a7, a3, -32
+; CHECK-RV64-NEXT:    neg a5, a5
+; CHECK-RV64-NEXT:    and a7, a5, a7
 ; CHECK-RV64-NEXT:    li a5, 16
 ; CHECK-RV64-NEXT:    add a6, a1, a6
 ; CHECK-RV64-NEXT:    bltu a7, a5, .LBB47_4
@@ -614,10 +612,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV64-NEXT:    vslidedown.vi v0, v8, 4
 ; CHECK-RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vlse64.v v16, (a6), a2, v0.t
-; CHECK-RV64-NEXT:    addi a6, a4, -16
-; CHECK-RV64-NEXT:    sltu a4, a4, a6
-; CHECK-RV64-NEXT:    addi a4, a4, -1
-; CHECK-RV64-NEXT:    and a4, a4, a6
+; CHECK-RV64-NEXT:    sltiu a6, a4, 17
+; CHECK-RV64-NEXT:    neg a6, a6
+; CHECK-RV64-NEXT:    addi a4, a4, -16
+; CHECK-RV64-NEXT:    and a4, a6, a4
 ; CHECK-RV64-NEXT:    bltu a3, a5, .LBB47_6
 ; CHECK-RV64-NEXT:  # %bb.5:
 ; CHECK-RV64-NEXT:    li a3, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
index 25624ea0fcf6c..c7edae931a126 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
@@ -411,14 +411,14 @@ define void @strided_store_v32f64(<32 x double> %v, ptr %ptr, i32 signext %strid
 ; CHECK-NEXT:  .LBB38_2:
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    sltiu a4, a2, 17
+; CHECK-NEXT:    addi a2, a2, -16
 ; CHECK-NEXT:    mul a3, a3, a1
-; CHECK-NEXT:    add a0, a0, a3
-; CHECK-NEXT:    addi a3, a2, -16
-; CHECK-NEXT:    sltu a2, a2, a3
-; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    neg a4, a4
+; CHECK-NEXT:    and a2, a4, a2
+; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v16, (a0), a1, v0.t
 ; CHECK-NEXT:    ret
@@ -437,12 +437,12 @@ define void @strided_store_v32f64_allones_mask(<32 x double> %v, ptr %ptr, i32 s
 ; CHECK-NEXT:  .LBB39_2:
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-NEXT:    sltiu a4, a2, 17
+; CHECK-NEXT:    addi a2, a2, -16
 ; CHECK-NEXT:    mul a3, a3, a1
+; CHECK-NEXT:    neg a4, a4
+; CHECK-NEXT:    and a2, a4, a2
 ; CHECK-NEXT:    add a0, a0, a3
-; CHECK-NEXT:    addi a3, a2, -16
-; CHECK-NEXT:    sltu a2, a2, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v16, (a0), a1
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
index f992d1f8f7eee..f69a4ffde7910 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
@@ -56,10 +56,10 @@ define <128 x i7> @vtrunc_v128i7_v128i16(<128 x i16> %a, <128 x i1> %m, i32 zero
 ; CHECK-NEXT:  .LBB4_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v24, 0, v0.t
-; CHECK-NEXT:    addi a2, a0, -64
-; CHECK-NEXT:    sltu a0, a0, a2
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    sltiu a2, a0, 65
+; CHECK-NEXT:    addi a0, a0, -64
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    and a0, a2, a0
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v24, v16, 0, v0.t
@@ -214,79 +214,85 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; RV32-NEXT:    vmv1r.v v7, v0
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vslidedown.vi v5, v0, 8
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v4, v0, 4
-; RV32-NEXT:    addi a2, a7, -64
-; RV32-NEXT:    vslidedown.vi v3, v5, 4
-; RV32-NEXT:    sltu a3, a7, a2
-; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    and a4, a3, a2
-; RV32-NEXT:    addi a2, a4, -32
-; RV32-NEXT:    sltu a3, a4, a2
-; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    and a3, a3, a2
+; RV32-NEXT:    sltiu a2, a7, 65
+; RV32-NEXT:    addi a3, a7, -64
+; RV32-NEXT:    neg a4, a2
+; RV32-NEXT:    and a4, a4, a3
+; RV32-NEXT:    sltiu a2, a4, 33
+; RV32-NEXT:    addi a3, a4, -32
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and t1, a2, a3
 ; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    addi t0, a3, -16
-; RV32-NEXT:    mv a5, a3
-; RV32-NEXT:    bltu a3, a2, .LBB16_2
+; RV32-NEXT:    vslidedown.vi v3, v5, 4
+; RV32-NEXT:    mv a5, t1
+; RV32-NEXT:    bltu t1, a2, .LBB16_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a5, 16
 ; RV32-NEXT:  .LBB16_2:
-; RV32-NEXT:    li t2, 64
-; RV32-NEXT:    sltu t1, a3, t0
+; RV32-NEXT:    li t0, 64
+; RV32-NEXT:    sltiu a3, t1, 17
 ; RV32-NEXT:    mv a6, a7
-; RV32-NEXT:    bltu a7, t2, .LBB16_4
+; RV32-NEXT:    bltu a7, t0, .LBB16_4
 ; RV32-NEXT:  # %bb.3:
 ; RV32-NEXT:    li a6, 64
 ; RV32-NEXT:  .LBB16_4:
 ; RV32-NEXT:    addi t2, a1, 128
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v6, v4, 2
-; RV32-NEXT:    addi t6, a1, 512
-; RV32-NEXT:    addi t5, a1, 640
+; RV32-NEXT:    addi t5, a1, 512
+; RV32-NEXT:    addi t4, a1, 640
 ; RV32-NEXT:    vslidedown.vi v0, v3, 2
-; RV32-NEXT:    addi t1, t1, -1
+; RV32-NEXT:    neg t0, a3
+; RV32-NEXT:    addi t1, t1, -16
 ; RV32-NEXT:    addi t3, a1, 384
 ; RV32-NEXT:    vslidedown.vi v2, v5, 2
 ; RV32-NEXT:    li a3, 32
-; RV32-NEXT:    addi t4, a6, -32
-; RV32-NEXT:    sltu a6, a6, t4
-; RV32-NEXT:    addi a6, a6, -1
-; RV32-NEXT:    and a6, a6, t4
-; RV32-NEXT:    addi t4, a6, -16
-; RV32-NEXT:    sltu s0, a6, t4
-; RV32-NEXT:    addi s0, s0, -1
+; RV32-NEXT:    sltiu t6, a6, 33
+; RV32-NEXT:    addi a6, a6, -32
+; RV32-NEXT:    neg t6, t6
+; RV32-NEXT:    and a6, t6, a6
+; RV32-NEXT:    sltiu t6, a6, 17
+; RV32-NEXT:    neg t6, t6
+; RV32-NEXT:    addi s0, a6, -16
 ; RV32-NEXT:    bltu a6, a2, .LBB16_6
 ; RV32-NEXT:  # %bb.5:
 ; RV32-NEXT:    li a6, 16
 ; RV32-NEXT:  .LBB16_6:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle64.v v8, (t6)
-; RV32-NEXT:    csrr t6, vlenb
+; RV32-NEXT:    vle64.v v8, (t5)
+; RV32-NEXT:    csrr t5, vlenb
 ; RV32-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    mul t6, t6, a0
+; RV32-NEXT:    mul t5, t5, a0
 ; RV32-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT:    add t6, sp, t6
-; RV32-NEXT:    addi t6, t6, 16
-; RV32-NEXT:    vs8r.v v8, (t6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vle64.v v8, (t5)
-; RV32-NEXT:    vle64.v v16, (t2)
+; RV32-NEXT:    add t5, sp, t5
+; RV32-NEXT:    addi t5, t5, 16
+; RV32-NEXT:    vs8r.v v8, (t5) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vle64.v v16, (t4)
+; RV32-NEXT:    vle64.v v8, (t2)
+; RV32-NEXT:    csrr t2, vlenb
+; RV32-NEXT:    li t4, 40
+; RV32-NEXT:    mul t2, t2, t4
+; RV32-NEXT:    add t2, sp, t2
+; RV32-NEXT:    addi t2, t2, 16
+; RV32-NEXT:    vs8r.v v8, (t2) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vle64.v v24, (a1)
 ; RV32-NEXT:    csrr t2, vlenb
-; RV32-NEXT:    li t5, 48
-; RV32-NEXT:    mul t2, t2, t5
+; RV32-NEXT:    li t4, 48
+; RV32-NEXT:    mul t2, t2, t4
 ; RV32-NEXT:    add t2, sp, t2
 ; RV32-NEXT:    addi t2, t2, 16
 ; RV32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
@@ -296,8 +302,8 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    add t2, sp, t2
 ; RV32-NEXT:    addi t2, t2, 16
 ; RV32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    and t2, t1, t0
-; RV32-NEXT:    and t1, s0, t4
+; RV32-NEXT:    and t2, t0, t1
+; RV32-NEXT:    and t1, t6, s0
 ; RV32-NEXT:    addi a1, a1, 256
 ; RV32-NEXT:    mv t0, a4
 ; RV32-NEXT:    bltu a4, a3, .LBB16_8
@@ -305,45 +311,45 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    li t0, 32
 ; RV32-NEXT:  .LBB16_8:
 ; RV32-NEXT:    vsetvli zero, t2, e32, m4, ta, ma
-; RV32-NEXT:    vnsrl.wi v24, v8, 0, v0.t
-; RV32-NEXT:    csrr t2, vlenb
-; RV32-NEXT:    li t3, 24
-; RV32-NEXT:    mul t2, t2, t3
-; RV32-NEXT:    add t2, sp, t2
-; RV32-NEXT:    addi t2, t2, 16
-; RV32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vnsrl.wi v24, v16, 0, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr t2, vlenb
 ; RV32-NEXT:    li t3, 56
 ; RV32-NEXT:    mul t2, t2, t3
 ; RV32-NEXT:    add t2, sp, t2
 ; RV32-NEXT:    addi t2, t2, 16
-; RV32-NEXT:    vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v8, (t2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a5, e32, m4, ta, ma
-; RV32-NEXT:    vnsrl.wi v8, v24, 0, v0.t
+; RV32-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 6
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vmv1r.v v0, v6
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    li t2, 40
+; RV32-NEXT:    mul a5, a5, t2
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 16
+; RV32-NEXT:    vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, t1, e32, m4, ta, ma
-; RV32-NEXT:    vnsrl.wi v8, v16, 0, v0.t
+; RV32-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 4
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a5, t0, -16
-; RV32-NEXT:    sltu t0, t0, a5
-; RV32-NEXT:    addi t0, t0, -1
-; RV32-NEXT:    and a5, t0, a5
+; RV32-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    sltiu a5, t0, 17
+; RV32-NEXT:    addi t0, t0, -16
+; RV32-NEXT:    neg a5, a5
+; RV32-NEXT:    and a5, a5, t0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a1)
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v30, v7, 2
+; RV32-NEXT:    vslidedown.vi v28, v7, 2
 ; RV32-NEXT:    vmv1r.v v0, v4
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li t0, 48
@@ -364,9 +370,15 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a5, e32, m4, ta, ma
-; RV32-NEXT:    vnsrl.wi v16, v8, 0, v0.t
+; RV32-NEXT:    vnsrl.wi v8, v16, 0, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    mul a1, a1, a5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    bltu a4, a2, .LBB16_10
 ; RV32-NEXT:  # %bb.9:
 ; RV32-NEXT:    li a4, 16
@@ -375,32 +387,33 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a4, e32, m4, ta, ma
-; RV32-NEXT:    vnsrl.wi v24, v8, 0, v0.t
+; RV32-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    mv a1, a7
 ; RV32-NEXT:    bltu a7, a3, .LBB16_12
 ; RV32-NEXT:  # %bb.11:
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:  .LBB16_12:
-; RV32-NEXT:    vmv1r.v v0, v30
+; RV32-NEXT:    vmv1r.v v0, v28
+; RV32-NEXT:    vmv4r.v v8, v24
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
+; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    li a5, 40
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
@@ -417,7 +430,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a4, a1, -16
+; RV32-NEXT:    sltiu a4, a1, 17
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    li a6, 56
 ; RV32-NEXT:    mul a5, a5, a6
@@ -438,7 +451,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li a6, 24
+; RV32-NEXT:    li a6, 40
 ; RV32-NEXT:    mul a5, a5, a6
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
@@ -450,11 +463,12 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    sltu a1, a1, a4
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a1, a1, a4
+; RV32-NEXT:    neg a4, a4
+; RV32-NEXT:    addi a1, a1, -16
+; RV32-NEXT:    and a1, a4, a1
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
@@ -466,35 +480,34 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:  .LBB16_14:
 ; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a7, e32, m4, ta, ma
-; RV32-NEXT:    vnsrl.wi v24, v16, 0, v0.t
+; RV32-NEXT:    vnsrl.wi v16, v24, 0, v0.t
 ; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vslideup.vi v24, v8, 16
-; RV32-NEXT:    vse32.v v24, (a0)
-; RV32-NEXT:    addi a1, a0, 256
+; RV32-NEXT:    vslideup.vi v16, v8, 16
+; RV32-NEXT:    vse32.v v16, (a0)
+; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 48
+; RV32-NEXT:    li a3, 56
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
-; RV32-NEXT:    addi a1, a0, 128
+; RV32-NEXT:    addi a1, a0, 384
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 56
-; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    slli a2, a2, 6
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
-; RV32-NEXT:    addi a0, a0, 384
+; RV32-NEXT:    addi a0, a0, 256
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
@@ -537,66 +550,66 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV64-NEXT:    vslidedown.vi v5, v0, 8
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v4, v0, 4
-; RV64-NEXT:    addi a2, a7, -64
-; RV64-NEXT:    vslidedown.vi v3, v5, 4
-; RV64-NEXT:    sltu a3, a7, a2
-; RV64-NEXT:    addi a3, a3, -1
-; RV64-NEXT:    and a4, a3, a2
-; RV64-NEXT:    addi a2, a4, -32
-; RV64-NEXT:    sltu a3, a4, a2
-; RV64-NEXT:    addi a3, a3, -1
-; RV64-NEXT:    and a3, a3, a2
+; RV64-NEXT:    sltiu a2, a7, 65
+; RV64-NEXT:    addi a3, a7, -64
+; RV64-NEXT:    neg a4, a2
+; RV64-NEXT:    and a4, a4, a3
+; RV64-NEXT:    sltiu a2, a4, 33
+; RV64-NEXT:    addi a3, a4, -32
+; RV64-NEXT:    neg a2, a2
+; RV64-NEXT:    and t1, a2, a3
 ; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    addi t0, a3, -16
-; RV64-NEXT:    mv a5, a3
-; RV64-NEXT:    bltu a3, a2, .LBB16_2
+; RV64-NEXT:    vslidedown.vi v3, v5, 4
+; RV64-NEXT:    mv a5, t1
+; RV64-NEXT:    bltu t1, a2, .LBB16_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a5, 16
 ; RV64-NEXT:  .LBB16_2:
-; RV64-NEXT:    li t2, 64
-; RV64-NEXT:    sltu t1, a3, t0
+; RV64-NEXT:    li t0, 64
+; RV64-NEXT:    sltiu a3, t1, 17
 ; RV64-NEXT:    mv a6, a7
-; RV64-NEXT:    bltu a7, t2, .LBB16_4
+; RV64-NEXT:    bltu a7, t0, .LBB16_4
 ; RV64-NEXT:  # %bb.3:
 ; RV64-NEXT:    li a6, 64
 ; RV64-NEXT:  .LBB16_4:
 ; RV64-NEXT:    addi t2, a1, 128
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v6, v4, 2
-; RV64-NEXT:    addi t6, a1, 512
-; RV64-NEXT:    addi t5, a1, 640
+; RV64-NEXT:    addi t5, a1, 512
+; RV64-NEXT:    addi t4, a1, 640
 ; RV64-NEXT:    vslidedown.vi v0, v3, 2
-; RV64-NEXT:    addi t1, t1, -1
+; RV64-NEXT:    neg t0, a3
+; RV64-NEXT:    addi t1, t1, -16
 ; RV64-NEXT:    addi t3, a1, 384
 ; RV64-NEXT:    vslidedown.vi v2, v5, 2
 ; RV64-NEXT:    li a3, 32
-; RV64-NEXT:    addi t4, a6, -32
-; RV64-NEXT:    sltu a6, a6, t4
-; RV64-NEXT:    addi a6, a6, -1
-; RV64-NEXT:    and a6, a6, t4
-; RV64-NEXT:    addi t4, a6, -16
-; RV64-NEXT:    sltu s0, a6, t4
-; RV64-NEXT:    addi s0, s0, -1
+; RV64-NEXT:    sltiu t6, a6, 33
+; RV64-NEXT:    addi a6, a6, -32
+; RV64-NEXT:    neg t6, t6
+; RV64-NEXT:    and a6, t6, a6
+; RV64-NEXT:    sltiu t6, a6, 17
+; RV64-NEXT:    neg t6, t6
+; RV64-NEXT:    addi s0, a6, -16
 ; RV64-NEXT:    bltu a6, a2, .LBB16_6
 ; RV64-NEXT:  # %bb.5:
 ; RV64-NEXT:    li a6, 16
 ; RV64-NEXT:  .LBB16_6:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v8, (t6)
-; RV64-NEXT:    csrr t6, vlenb
+; RV64-NEXT:    vle64.v v8, (t5)
+; RV64-NEXT:    csrr t5, vlenb
 ; RV64-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    mul t6, t6, a0
+; RV64-NEXT:    mul t5, t5, a0
 ; RV64-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    add t6, sp, t6
-; RV64-NEXT:    addi t6, t6, 32
-; RV64-NEXT:    vs8r.v v8, (t6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vle64.v v8, (t5)
+; RV64-NEXT:    add t5, sp, t5
+; RV64-NEXT:    addi t5, t5, 32
+; RV64-NEXT:    vs8r.v v8, (t5) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vle64.v v8, (t4)
 ; RV64-NEXT:    vle64.v v16, (t2)
 ; RV64-NEXT:    vle64.v v24, (a1)
 ; RV64-NEXT:    csrr t2, vlenb
-; RV64-NEXT:    li t5, 48
-; RV64-NEXT:    mul t2, t2, t5
+; RV64-NEXT:    li t4, 48
+; RV64-NEXT:    mul t2, t2, t4
 ; RV64-NEXT:    add t2, sp, t2
 ; RV64-NEXT:    addi t2, t2, 32
 ; RV64-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
@@ -606,8 +619,8 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV64-NEXT:    add t2, sp, t2
 ; RV64-NEXT:    addi t2, t2, 32
 ; RV64-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    and t2, t1, t0
-; RV64-NEXT:    and t1, s0, t4
+; RV64-NEXT:    and t2, t0, t1
+; RV64-NEXT:    and t1, t6, s0
 ; RV64-NEXT:    addi a1, a1, 256
 ; RV64-NEXT:    mv t0, a4
 ; RV64-NEXT:    bltu a4, a3, .LBB16_8
@@ -644,10 +657,10 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV64-NEXT:    add a5, sp, a5
 ; RV64-NEXT:    addi a5, a5, 32
 ; RV64-NEXT:    vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    addi a5, t0, -16
-; RV64-NEXT:    sltu t0, t0, a5
-; RV64-NEXT:    addi t0, t0, -1
-; RV64-NEXT:    and a5, t0, a5
+; RV64-NEXT:    sltiu a5, t0, 17
+; RV64-NEXT:    addi t0, t0, -16
+; RV64-NEXT:    neg a5, a5
+; RV64-NEXT:    and a5, a5, t0
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a1)
 ; RV64-NEXT:    addi a1, sp, 32
@@ -727,7 +740,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV64-NEXT:    add a4, sp, a4
 ; RV64-NEXT:    addi a4, a4, 32
 ; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    addi a4, a1, -16
+; RV64-NEXT:    sltiu a4, a1, 17
 ; RV64-NEXT:    csrr a5, vlenb
 ; RV64-NEXT:    li a6, 56
 ; RV64-NEXT:    mul a5, a5, a6
@@ -760,9 +773,9 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV64-NEXT:    add a5, sp, a5
 ; RV64-NEXT:    addi a5, a5, 32
 ; RV64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    sltu a1, a1, a4
-; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    and a1, a1, a4
+; RV64-NEXT:    neg a4, a4
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    and a1, a4, a1
 ; RV64-NEXT:    csrr a4, vlenb
 ; RV64-NEXT:    slli a4, a4, 5
 ; RV64-NEXT:    add a4, sp, a4
@@ -786,17 +799,17 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV64-NEXT:    vslideup.vi v24, v8, 16
 ; RV64-NEXT:    vse32.v v24, (a0)
-; RV64-NEXT:    addi a1, a0, 256
+; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 48
+; RV64-NEXT:    li a3, 56
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 32
 ; RV64-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    vse32.v v8, (a1)
-; RV64-NEXT:    addi a1, a0, 128
+; RV64-NEXT:    addi a1, a0, 256
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    li a3, 48
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 32
@@ -837,10 +850,10 @@ define <32 x i32> @vtrunc_v32i32_v32i64(<32 x i64> %a, <32 x i1> %m, i32 zeroext
 ; CHECK-NEXT:  .LBB17_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v24, 0, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v24, v16, 0, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
index 3d1febe95421f..cde3f21947824 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
@@ -372,10 +372,10 @@ define <32 x double> @vuitofp_v32f64_v32i64(<32 x i64> %va, <32 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB25_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
@@ -395,10 +395,10 @@ define <32 x double> @vuitofp_v32f64_v32i64_unmasked(<32 x i64> %va, i32 zeroext
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
index 96dff2464e501..3fc3b47113a32 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
@@ -354,10 +354,10 @@ define <256 x i8> @vadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    addi a0, a1, -128
-; CHECK-NEXT:    sltu a3, a1, a0
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a0, a3, a0
+; CHECK-NEXT:    sltiu a0, a1, 129
+; CHECK-NEXT:    addi a3, a1, -128
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v16, v16, -1, v0.t
 ; CHECK-NEXT:    bltu a1, a2, .LBB32_2
@@ -383,10 +383,10 @@ define <256 x i8> @vadd_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:  .LBB33_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v8, v8, -1
-; CHECK-NEXT:    addi a1, a0, -128
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 129
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -128
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v16, v16, -1
 ; CHECK-NEXT:    ret
@@ -1328,10 +1328,10 @@ define <32 x i64> @vadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:  .LBB108_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v16, v16, -1, v0.t
@@ -1351,10 +1351,10 @@ define <32 x i64> @vadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:  .LBB109_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v8, v8, -1
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v16, v16, -1
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
index da26c63b61e34..f2e051ee41ccb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
@@ -453,10 +453,10 @@ define <32 x double> @vfsgnj_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsgnj.vv v8, v8, v24, v0.t
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -488,10 +488,10 @@ define <32 x double> @vfsgnj_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
 ; CHECK-NEXT:  .LBB35_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsgnj.vv v8, v8, v0
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsgnj.vv v16, v16, v24
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
index 2774aba974a29..12c7009e43a44 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
@@ -621,10 +621,10 @@ define <32 x double> @vfabs_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v16, v0.t
@@ -644,10 +644,10 @@ define <32 x double> @vfabs_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; CHECK-NEXT:  .LBB35_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
index f28b970f48ff7..e863e141376e9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
@@ -855,10 +855,10 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a0, a4, -16
-; CHECK-NEXT:    sltu a1, a4, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a4, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a4, a4, -16
+; CHECK-NEXT:    and a0, a0, a4
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a2, 24
@@ -898,27 +898,21 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    addi a1, a2, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a2)
 ; CHECK-NEXT:    addi a2, a0, 128
-; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    vle64.v v24, (a2)
 ; CHECK-NEXT:    vle64.v v0, (a0)
 ; CHECK-NEXT:    li a1, 16
@@ -927,31 +921,25 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB51_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v0, v8, v16
-; CHECK-NEXT:    addi a0, a4, -16
-; CHECK-NEXT:    sltu a1, a4, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a4, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a4, a4, -16
+; CHECK-NEXT:    and a0, a0, a4
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v16, v8
+; CHECK-NEXT:    vfmadd.vv v24, v8, v16
 ; CHECK-NEXT:    vmv8r.v v8, v0
 ; CHECK-NEXT:    vmv.v.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
index 403d0b8d57940..484389e29bed9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
@@ -381,10 +381,10 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmax.vv v8, v8, v24, v0.t
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -416,10 +416,10 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
 ; CHECK-NEXT:  .LBB27_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmax.vv v8, v8, v0
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmax.vv v16, v16, v24
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
index 56f7a8d48c5a1..92564e229bccc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
@@ -381,10 +381,10 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmin.vv v8, v8, v24, v0.t
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -416,10 +416,10 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
 ; CHECK-NEXT:  .LBB27_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmin.vv v8, v8, v0
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmin.vv v16, v16, v24
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
index a9857880b5942..5298b186f2d25 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
@@ -627,10 +627,10 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a0, a4, -16
-; CHECK-NEXT:    sltu a1, a4, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a4, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a4, a4, -16
+; CHECK-NEXT:    and a0, a0, a4
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a2, 24
@@ -670,27 +670,21 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    addi a1, a2, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a2)
 ; CHECK-NEXT:    addi a2, a0, 128
-; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    vle64.v v24, (a2)
 ; CHECK-NEXT:    vle64.v v0, (a0)
 ; CHECK-NEXT:    li a1, 16
@@ -699,31 +693,25 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB51_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v0, v8, v16
-; CHECK-NEXT:    addi a0, a4, -16
-; CHECK-NEXT:    sltu a1, a4, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a4, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a4, a4, -16
+; CHECK-NEXT:    and a0, a0, a4
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v16, v8
+; CHECK-NEXT:    vfmadd.vv v24, v8, v16
 ; CHECK-NEXT:    vmv8r.v v8, v0
 ; CHECK-NEXT:    vmv.v.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
index 84a89b23bc3b5..2b09bd9a22b1f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
@@ -589,10 +589,10 @@ define <32 x double> @vfneg_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfneg.v v8, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfneg.v v16, v16, v0.t
@@ -612,10 +612,10 @@ define <32 x double> @vfneg_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; CHECK-NEXT:  .LBB35_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfneg.v v8, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfneg.v v16, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
index b431d4873fa1b..9f72f786591a5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
@@ -361,10 +361,10 @@ define <32 x double> @vfsqrt_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zero
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsqrt.v v8, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsqrt.v v16, v16, v0.t
@@ -384,10 +384,10 @@ define <32 x double> @vfsqrt_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %
 ; CHECK-NEXT:  .LBB27_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsqrt.v v16, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
index f5978de080082..aa7c3d5e113d6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
@@ -258,10 +258,10 @@ define <256 x i8> @vmax_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zero
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltiu a1, a2, 129
+; CHECK-NEXT:    addi a4, a2, -128
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmax.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB22_2
@@ -289,10 +289,10 @@ define <256 x i8> @vmax_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %e
 ; CHECK-NEXT:  .LBB23_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vmax.vx v8, v8, a0
-; CHECK-NEXT:    addi a2, a1, -128
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    sltiu a2, a1, 129
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    addi a1, a1, -128
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmax.vx v16, v16, a0
 ; CHECK-NEXT:    ret
@@ -1001,10 +1001,10 @@ define <32 x i64> @vmax_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vmax.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmax.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
index 7450a70df66ba..3d6dc76d5e70d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
@@ -257,10 +257,10 @@ define <256 x i8> @vmaxu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zer
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltiu a1, a2, 129
+; CHECK-NEXT:    addi a4, a2, -128
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmaxu.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB22_2
@@ -288,10 +288,10 @@ define <256 x i8> @vmaxu_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %
 ; CHECK-NEXT:  .LBB23_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vmaxu.vx v8, v8, a0
-; CHECK-NEXT:    addi a2, a1, -128
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    sltiu a2, a1, 129
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    addi a1, a1, -128
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmaxu.vx v16, v16, a0
 ; CHECK-NEXT:    ret
@@ -1000,10 +1000,10 @@ define <32 x i64> @vmaxu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vmaxu.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmaxu.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
index 31d19304c2909..5000bea58fa36 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
@@ -258,10 +258,10 @@ define <256 x i8> @vmin_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zero
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltiu a1, a2, 129
+; CHECK-NEXT:    addi a4, a2, -128
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmin.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB22_2
@@ -289,10 +289,10 @@ define <256 x i8> @vmin_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %e
 ; CHECK-NEXT:  .LBB23_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vmin.vx v8, v8, a0
-; CHECK-NEXT:    addi a2, a1, -128
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    sltiu a2, a1, 129
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    addi a1, a1, -128
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmin.vx v16, v16, a0
 ; CHECK-NEXT:    ret
@@ -1001,10 +1001,10 @@ define <32 x i64> @vmin_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vmin.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmin.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
index dda69ec8a7d2e..42b05a295e50e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
@@ -257,10 +257,10 @@ define <256 x i8> @vminu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zer
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltiu a1, a2, 129
+; CHECK-NEXT:    addi a4, a2, -128
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vminu.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB22_2
@@ -288,10 +288,10 @@ define <256 x i8> @vminu_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %
 ; CHECK-NEXT:  .LBB23_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
-; CHECK-NEXT:    addi a2, a1, -128
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    sltiu a2, a1, 129
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    addi a1, a1, -128
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vminu.vx v16, v16, a0
 ; CHECK-NEXT:    ret
@@ -1000,10 +1000,10 @@ define <32 x i64> @vminu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vminu.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vminu.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
index 3f5751aaa2cad..071a726604787 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
@@ -285,16 +285,16 @@ define <32 x i8> @vpgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %
 ; RV64-NEXT:    vsext.vf8 v16, v8
 ; RV64-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; RV64-NEXT:    vluxei64.v v10, (a0), v16, v0.t
-; RV64-NEXT:    addi a2, a1, -16
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
 ; RV64-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 16
-; RV64-NEXT:    sltu a1, a1, a2
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v16, v8
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    li a0, 32
@@ -1997,12 +1997,12 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex
 ; RV32-NEXT:  .LBB94_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v24, (zero), v8, v0.t
-; RV32-NEXT:    addi a1, a0, -16
+; RV32-NEXT:    sltiu a1, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a0, a0, a1
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    neg a1, a1
+; RV32-NEXT:    and a0, a1, a0
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 16
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -2020,12 +2020,12 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex
 ; RV64-NEXT:  .LBB94_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (zero), v8, v0.t
-; RV64-NEXT:    addi a1, a0, -16
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    sltiu a1, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
+; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    and a0, a1, a0
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (zero), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2048,12 +2048,12 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
 ; RV32-NEXT:  .LBB95_2:
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2077,12 +2077,12 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
 ; RV64-NEXT:  .LBB95_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2106,12 +2106,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV32-NEXT:  .LBB96_2:
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2136,12 +2136,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV64-NEXT:  .LBB96_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2168,12 +2168,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV32-NEXT:    vluxei16.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
-; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
+; RV32-NEXT:    neg a2, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei16.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2194,12 +2194,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV64-NEXT:    vluxei16.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v16, 16
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei16.v v16, (a0), v24, v0.t
 ; RV64-NEXT:    ret
@@ -2226,12 +2226,12 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
-; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
+; RV32-NEXT:    neg a2, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2253,12 +2253,12 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
 ; RV64-NEXT:  .LBB98_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2284,12 +2284,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
-; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
+; RV32-NEXT:    neg a2, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2312,12 +2312,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV64-NEXT:  .LBB99_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2344,12 +2344,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
-; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
+; RV32-NEXT:    neg a2, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2370,12 +2370,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV64-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v16, 16
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV64-NEXT:    ret
@@ -2399,12 +2399,12 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
 ; RV32-NEXT:  .LBB101_2:
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2427,12 +2427,12 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
 ; RV64-NEXT:  .LBB101_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v24, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2455,12 +2455,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV32-NEXT:  .LBB102_2:
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2483,12 +2483,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV64-NEXT:  .LBB102_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v24, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2512,12 +2512,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV32-NEXT:  .LBB103_2:
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2540,12 +2540,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV64-NEXT:  .LBB103_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v24, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2575,12 +2575,12 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
-; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
+; RV32-NEXT:    neg a2, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2598,12 +2598,12 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
 ; RV64-NEXT:  .LBB104_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
index d058669c103f3..8e50dffcaf31c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
@@ -325,12 +325,12 @@ define <32 x double> @vpload_v32f64(ptr %ptr, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:  .LBB31_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0), v0.t
-; CHECK-NEXT:    addi a2, a1, -16
+; CHECK-NEXT:    sltiu a2, a1, 17
+; CHECK-NEXT:    addi a1, a1, -16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a0), v0.t
@@ -352,15 +352,15 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:  .LBB32_2:
-; CHECK-NEXT:    addi a5, a3, -16
+; CHECK-NEXT:    sltiu a5, a3, 17
+; CHECK-NEXT:    addi a3, a3, -16
 ; CHECK-NEXT:    addi a4, a1, 128
-; CHECK-NEXT:    addi a7, a2, -32
-; CHECK-NEXT:    sltu a3, a3, a5
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a6, a3, a5
-; CHECK-NEXT:    sltu a3, a2, a7
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a5, a3, a7
+; CHECK-NEXT:    sltiu a7, a2, 33
+; CHECK-NEXT:    neg a5, a5
+; CHECK-NEXT:    and a6, a5, a3
+; CHECK-NEXT:    addi a3, a2, -32
+; CHECK-NEXT:    neg a5, a7
+; CHECK-NEXT:    and a5, a5, a3
 ; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v8, 2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
index 0bacb5c26cb4a..3a36cda6dd04a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
@@ -1306,12 +1306,12 @@ define <32 x double> @vpmerge_vv_v32f64(<32 x double> %va, <32 x double> %vb, <3
 ; CHECK-NEXT:  .LBB83_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, tu, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    neg a0, a0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, tu, ma
@@ -1339,12 +1339,12 @@ define <32 x double> @vpmerge_vf_v32f64(double %a, <32 x double> %vb, <32 x i1>
 ; CHECK-NEXT:  .LBB84_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, tu, ma
 ; CHECK-NEXT:    vfmerge.vfm v8, v8, fa0, v0
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, tu, ma
 ; CHECK-NEXT:    vfmerge.vfm v16, v16, fa0, v0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
index b4d20d93f2a1c..e509b390a3067 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
@@ -1703,12 +1703,12 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
 ; RV32-NEXT:  .LBB83_2:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (zero), v24, v0.t
-; RV32-NEXT:    addi a0, a1, -16
+; RV32-NEXT:    sltiu a0, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a1, a1, a0
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    neg a0, a0
+; RV32-NEXT:    and a0, a0, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1737,12 +1737,12 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
 ; RV64-NEXT:  .LBB83_2:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (zero), v24, v0.t
-; RV64-NEXT:    addi a0, a2, -16
-; RV64-NEXT:    sltu a1, a2, a0
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a0, a2, 17
+; RV64-NEXT:    addi a2, a2, -16
+; RV64-NEXT:    neg a0, a0
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    and a0, a0, a2
 ; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1773,12 +1773,12 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
 ; RV32-NEXT:  .LBB84_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
-; RV32-NEXT:    addi a1, a2, -16
+; RV32-NEXT:    sltiu a1, a2, 17
+; RV32-NEXT:    addi a2, a2, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a2, a2, a1
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    neg a1, a1
+; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1819,12 +1819,12 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
 ; RV64-NEXT:    vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v24, (a0), v16, v0.t
-; RV64-NEXT:    addi a1, a2, -16
-; RV64-NEXT:    sltu a2, a2, a1
-; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    sltiu a1, a2, 17
+; RV64-NEXT:    addi a2, a2, -16
+; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, sp, a2
@@ -1859,12 +1859,12 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV32-NEXT:  .LBB85_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
-; RV32-NEXT:    addi a1, a2, -16
+; RV32-NEXT:    sltiu a1, a2, 17
+; RV32-NEXT:    addi a2, a2, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a2, a2, a1
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    neg a1, a1
+; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1905,12 +1905,12 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v24, (a0), v16, v0.t
-; RV64-NEXT:    addi a1, a2, -16
-; RV64-NEXT:    sltu a2, a2, a1
-; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    sltiu a1, a2, 17
+; RV64-NEXT:    addi a2, a2, -16
+; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, sp, a2
@@ -1946,12 +1946,12 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV32-NEXT:  .LBB86_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
-; RV32-NEXT:    addi a1, a2, -16
+; RV32-NEXT:    sltiu a1, a2, 17
+; RV32-NEXT:    addi a2, a2, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a2, a2, a1
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    neg a1, a1
+; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1992,12 +1992,12 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v24, (a0), v16, v0.t
-; RV64-NEXT:    addi a1, a2, -16
-; RV64-NEXT:    sltu a2, a2, a1
-; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    sltiu a1, a2, 17
+; RV64-NEXT:    addi a2, a2, -16
+; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, sp, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
index 855a87d21b7dc..b4e402caf5ba4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
@@ -255,12 +255,12 @@ define void @vpstore_v32f64(<32 x double> %val, ptr %ptr, <32 x i1> %m, i32 zero
 ; CHECK-NEXT:  .LBB24_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a0), v0.t
-; CHECK-NEXT:    addi a2, a1, -16
+; CHECK-NEXT:    sltiu a2, a1, 17
+; CHECK-NEXT:    addi a1, a1, -16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v16, (a0), v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
index acaa1e6fa002d..495049e51fb64 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
@@ -363,10 +363,10 @@ define <256 x i8> @vsadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    addi a0, a1, -128
-; CHECK-NEXT:    sltu a3, a1, a0
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a0, a3, a0
+; CHECK-NEXT:    sltiu a0, a1, 129
+; CHECK-NEXT:    addi a3, a1, -128
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v16, v16, -1, v0.t
 ; CHECK-NEXT:    bltu a1, a2, .LBB32_2
@@ -392,10 +392,10 @@ define <256 x i8> @vsadd_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:  .LBB33_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v8, v8, -1
-; CHECK-NEXT:    addi a1, a0, -128
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 129
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -128
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v16, v16, -1
 ; CHECK-NEXT:    ret
@@ -1335,10 +1335,10 @@ define <32 x i64> @vsadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:  .LBB108_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v16, v16, -1, v0.t
@@ -1358,10 +1358,10 @@ define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:  .LBB109_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v8, v8, -1
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v16, v16, -1
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
index 9b3b8348d9b30..a5f57c24aaaaf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
@@ -359,10 +359,10 @@ define <256 x i8> @vsaddu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    addi a0, a1, -128
-; CHECK-NEXT:    sltu a3, a1, a0
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a0, a3, a0
+; CHECK-NEXT:    sltiu a0, a1, 129
+; CHECK-NEXT:    addi a3, a1, -128
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v16, v16, -1, v0.t
 ; CHECK-NEXT:    bltu a1, a2, .LBB32_2
@@ -388,10 +388,10 @@ define <256 x i8> @vsaddu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:  .LBB33_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v8, v8, -1
-; CHECK-NEXT:    addi a1, a0, -128
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 129
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -128
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v16, v16, -1
 ; CHECK-NEXT:    ret
@@ -1331,10 +1331,10 @@ define <32 x i64> @vsaddu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:  .LBB108_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v16, v16, -1, v0.t
@@ -1354,10 +1354,10 @@ define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:  .LBB109_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v8, v8, -1
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v16, v16, -1
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
index f2f9f90f386c0..e91477a622b1f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
@@ -143,15 +143,15 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3
 ; CHECK-NEXT:    vmv1r.v v6, v8
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    li a2, 128
-; CHECK-NEXT:    addi a4, a1, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v24, (a0)
-; CHECK-NEXT:    addi a0, a3, -128
-; CHECK-NEXT:    vle8.v v8, (a4)
-; CHECK-NEXT:    sltu a4, a3, a0
+; CHECK-NEXT:    addi a0, a1, 128
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    sltiu a0, a3, 129
+; CHECK-NEXT:    neg a0, a0
 ; CHECK-NEXT:    vle8.v v16, (a1)
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a0, a4, a0
+; CHECK-NEXT:    addi a1, a3, -128
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
@@ -342,12 +342,12 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32
 ; CHECK-NEXT:  .LBB25_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    neg a0, a0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -511,12 +511,12 @@ define <64 x float> @select_v64f32(<64 x i1> %a, <64 x float> %b, <64 x float> %
 ; CHECK-NEXT:  .LBB35_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    addi a0, a2, -32
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    sltiu a0, a2, 33
+; CHECK-NEXT:    addi a2, a2, -32
+; CHECK-NEXT:    neg a0, a0
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 4
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
index 4c7d312e8e785..0947e39ce87e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
@@ -373,12 +373,12 @@ define <256 x i8> @vssub_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a2, 128
-; CHECK-NEXT:    addi a3, a1, -128
+; CHECK-NEXT:    sltiu a3, a1, 129
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    sltu a0, a1, a3
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a3, a0, a3
+; CHECK-NEXT:    addi a0, a1, -128
+; CHECK-NEXT:    neg a3, a3
+; CHECK-NEXT:    and a3, a3, a0
 ; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v16, v16, a0, v0.t
@@ -406,10 +406,10 @@ define <256 x i8> @vssub_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v8, v8, a2
-; CHECK-NEXT:    addi a1, a0, -128
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 129
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -128
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v16, v16, a2
 ; CHECK-NEXT:    ret
@@ -1376,10 +1376,10 @@ define <32 x i64> @vssub_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v16, v16, a2, v0.t
@@ -1400,10 +1400,10 @@ define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v8, v8, a2
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v16, v16, a2
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
index f9000a1b88a6d..12fef2f06bfcf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
@@ -368,12 +368,12 @@ define <256 x i8> @vssubu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a2, 128
-; CHECK-NEXT:    addi a3, a1, -128
+; CHECK-NEXT:    sltiu a3, a1, 129
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    sltu a0, a1, a3
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a3, a0, a3
+; CHECK-NEXT:    addi a0, a1, -128
+; CHECK-NEXT:    neg a3, a3
+; CHECK-NEXT:    and a3, a3, a0
 ; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v16, v16, a0, v0.t
@@ -401,10 +401,10 @@ define <256 x i8> @vssubu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v8, v8, a2
-; CHECK-NEXT:    addi a1, a0, -128
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 129
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -128
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v16, v16, a2
 ; CHECK-NEXT:    ret
@@ -1371,10 +1371,10 @@ define <32 x i64> @vssubu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v16, v16, a2, v0.t
@@ -1395,10 +1395,10 @@ define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v8, v8, a2
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v16, v16, a2
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
index e2d9e0ac2deea..0bdbf1bb54074 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
@@ -147,10 +147,10 @@ define <32 x i64> @vzext_v32i64_v32i32(<32 x i32> %va, <32 x i1> %m, i32 zeroext
 ; CHECK-NEXT:  .LBB12_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v16, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 16
 ; CHECK-NEXT:    vmv1r.v v0, v24
@@ -174,10 +174,10 @@ define <32 x i64> @vzext_v32i64_v32i32_unmasked(<32 x i32> %va, i32 zeroext %evl
 ; CHECK-NEXT:  .LBB13_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v24, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 16
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
index e2deefa26ecb3..0ed12ddbb0f2c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v5, v6
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v6, v7
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v5, v6
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_floor_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v6, v7
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFH-NEXT:    sub a2, a0, a1
 ; RV32ZVFH-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT:    sltu a3, a0, a2
+; RV32ZVFH-NEXT:    sltu a3, a1, a0
 ; RV32ZVFH-NEXT:    addi a3, a3, -1
 ; RV32ZVFH-NEXT:    and a2, a3, a2
 ; RV32ZVFH-NEXT:    vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFHMIN-NEXT:    sub a2, a0, a1
 ; RV32ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT:    sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a3, a3, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a3, a2
 ; RV32ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFH-NEXT:    sub a3, a0, a1
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT:    sltu a2, a0, a3
+; RV32ZVFH-NEXT:    sltu a2, a1, a0
 ; RV32ZVFH-NEXT:    addi a2, a2, -1
 ; RV32ZVFH-NEXT:    and a2, a2, a3
 ; RV32ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index 0e0c92b150d33..33ae7ca7d7847 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -270,14 +270,14 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v5, v16, v16, v0.t
@@ -398,18 +398,18 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v6, v16, v16, v0.t
@@ -892,14 +892,14 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v5, v16, v16, v0.t
@@ -1031,18 +1031,18 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v6, v16, v16, v0.t
@@ -1418,7 +1418,7 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v8, (a3)
-; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    sltu a3, a1, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vmv1r.v v0, v6
@@ -1509,7 +1509,7 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v24, (a3)
-; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    sltu a3, a1, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index 86ed239e99373..173ea25335375 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -270,14 +270,14 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v5, v16, v16, v0.t
@@ -398,18 +398,18 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v6, v16, v16, v0.t
@@ -892,14 +892,14 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v5, v16, v16, v0.t
@@ -1031,18 +1031,18 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v6, v16, v16, v0.t
@@ -1418,7 +1418,7 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v8, (a3)
-; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    sltu a3, a1, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vmv1r.v v0, v6
@@ -1509,7 +1509,7 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v24, (a3)
-; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    sltu a3, a1, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
index 736dd1225da88..cbccc96f43cbe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
@@ -958,7 +958,7 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    sub a5, a4, a1
 ; CHECK-NEXT:    add a6, a2, a3
 ; CHECK-NEXT:    vl8re64.v v8, (a6)
-; CHECK-NEXT:    sltu a6, a4, a5
+; CHECK-NEXT:    sltu a6, a1, a4
 ; CHECK-NEXT:    addi a6, a6, -1
 ; CHECK-NEXT:    and a5, a6, a5
 ; CHECK-NEXT:    srli a6, a1, 3
@@ -1059,7 +1059,7 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    add a3, a2, a5
 ; CHECK-NEXT:    vl8re64.v v8, (a3)
-; CHECK-NEXT:    sltu a3, a4, a6
+; CHECK-NEXT:    sltu a3, a1, a4
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a6, a3, a6
 ; CHECK-NEXT:    li a3, 63
diff --git a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
index c0a794afac3ae..c9478d65058f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
@@ -57,7 +57,7 @@ define <vscale x 16 x i64> @llrint_nxv16i64_nxv16f32(<vscale x 16 x float> %x, <
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
index c09df1a60d2ae..4136bab37bc9b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
@@ -119,7 +119,7 @@ define <vscale x 16 x iXLen> @lrint_nxv16f32(<vscale x 16 x float> %x, <vscale x
 ; RV64-i64-NEXT:    srli a2, a1, 3
 ; RV64-i64-NEXT:    sub a3, a0, a1
 ; RV64-i64-NEXT:    vslidedown.vx v0, v0, a2
-; RV64-i64-NEXT:    sltu a2, a0, a3
+; RV64-i64-NEXT:    sltu a2, a1, a0
 ; RV64-i64-NEXT:    addi a2, a2, -1
 ; RV64-i64-NEXT:    and a2, a2, a3
 ; RV64-i64-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
index 67e7f7c7fbd42..236ba9096f4f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16(<vscale x 32 x bfloat> %va
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v5, v6
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16_unmasked(<vscale x 32 x bf
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v6, v7
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v5, v6
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v6, v7
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFH-NEXT:    sub a2, a0, a1
 ; RV32ZVFH-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT:    sltu a3, a0, a2
+; RV32ZVFH-NEXT:    sltu a3, a1, a0
 ; RV32ZVFH-NEXT:    addi a3, a3, -1
 ; RV32ZVFH-NEXT:    and a2, a3, a2
 ; RV32ZVFH-NEXT:    vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFHMIN-NEXT:    sub a2, a0, a1
 ; RV32ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT:    sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a3, a3, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a3, a2
 ; RV32ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFH-NEXT:    sub a3, a0, a1
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT:    sltu a2, a0, a3
+; RV32ZVFH-NEXT:    sltu a2, a1, a0
 ; RV32ZVFH-NEXT:    addi a2, a2, -1
 ; RV32ZVFH-NEXT:    and a2, a2, a3
 ; RV32ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll b/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll
index 1ee7e138654b9..3e9c669106a26 100644
--- a/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll
@@ -24274,7 +24274,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_P1(<vscale x 16 x
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24302,7 +24302,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_P1(<vscale x 16 x
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24335,7 +24335,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_PALL(<vscale x 16
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24363,7 +24363,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_PALL(<vscale x 16
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24396,7 +24396,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_S1(<vscale x 16 x
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24424,7 +24424,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_S1(<vscale x 16 x
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24457,7 +24457,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_ALL(<vscale x 16 x
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24485,7 +24485,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_ALL(<vscale x 16 x
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24517,7 +24517,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_DEFAULT(<vscale x
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24545,7 +24545,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_DEFAULT(<vscale x
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24586,10 +24586,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_P1(<vscale x 16 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24614,10 +24614,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_P1(<vscale x 16 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24647,10 +24647,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_PALL(<vscale x 16 x i8> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24675,10 +24675,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_PALL(<vscale x 16 x i8> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24708,10 +24708,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_S1(<vscale x 16 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24736,10 +24736,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_S1(<vscale x 16 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24769,10 +24769,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_ALL(<vscale x 16 x i8> %val, <v
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24797,10 +24797,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_ALL(<vscale x 16 x i8> %val, <v
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24829,10 +24829,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_DEFAULT(<vscale x 16 x i8> %val
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24857,10 +24857,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_DEFAULT(<vscale x 16 x i8> %val
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v9, (zero), v24
@@ -25538,7 +25538,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_P1(<vscale x 16
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25566,7 +25566,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_P1(<vscale x 16
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25599,7 +25599,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_PALL(<vscale x 1
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25627,7 +25627,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_PALL(<vscale x 1
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25660,7 +25660,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_S1(<vscale x 16
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25688,7 +25688,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_S1(<vscale x 16
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25721,7 +25721,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_ALL(<vscale x 16
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25749,7 +25749,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_ALL(<vscale x 16
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25781,7 +25781,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_DEFAULT(<vscale
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25809,7 +25809,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_DEFAULT(<vscale
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25850,10 +25850,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_P1(<vscale x 16 x i16> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v24
@@ -25878,10 +25878,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_P1(<vscale x 16 x i16> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v24
@@ -25911,10 +25911,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_PALL(<vscale x 16 x i16> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v24
@@ -25939,10 +25939,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_PALL(<vscale x 16 x i16> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v24
@@ -25972,10 +25972,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_S1(<vscale x 16 x i16> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v24
@@ -26000,10 +26000,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_S1(<vscale x 16 x i16> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v24
@@ -26033,10 +26033,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_ALL(<vscale x 16 x i16> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v24
@@ -26061,10 +26061,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_ALL(<vscale x 16 x i16> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v24
@@ -26093,10 +26093,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_DEFAULT(<vscale x 16 x i16> %v
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v24
@@ -26121,10 +26121,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_DEFAULT(<vscale x 16 x i16> %v
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v24
@@ -26802,7 +26802,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_P1(<vscale x 16
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -26829,7 +26829,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_P1(<vscale x 16
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -26861,7 +26861,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_PALL(<vscale x 1
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -26888,7 +26888,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_PALL(<vscale x 1
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -26920,7 +26920,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_S1(<vscale x 16
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -26947,7 +26947,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_S1(<vscale x 16
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -26979,7 +26979,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_ALL(<vscale x 16
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -27006,7 +27006,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_ALL(<vscale x 16
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -27037,7 +27037,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_DEFAULT(<vscale
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -27064,7 +27064,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_DEFAULT(<vscale
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -27104,10 +27104,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_P1(<vscale x 16 x i32> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27132,10 +27132,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_P1(<vscale x 16 x i32> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27165,10 +27165,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_PALL(<vscale x 16 x i32> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27193,10 +27193,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_PALL(<vscale x 16 x i32> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27226,10 +27226,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_S1(<vscale x 16 x i32> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27254,10 +27254,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_S1(<vscale x 16 x i32> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27287,10 +27287,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_ALL(<vscale x 16 x i32> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27315,10 +27315,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_ALL(<vscale x 16 x i32> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27347,10 +27347,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_DEFAULT(<vscale x 16 x i32> %v
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27375,10 +27375,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_DEFAULT(<vscale x 16 x i32> %v
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28056,7 +28056,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_P1(<vscale x 1
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28083,7 +28083,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_P1(<vscale x 1
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28115,7 +28115,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_PALL(<vscale x
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28142,7 +28142,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_PALL(<vscale x
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28174,7 +28174,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_S1(<vscale x 1
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28201,7 +28201,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_S1(<vscale x 1
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28233,7 +28233,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_ALL(<vscale x
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28260,7 +28260,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_ALL(<vscale x
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28291,7 +28291,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_DEFAULT(<vscal
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28318,7 +28318,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_DEFAULT(<vscal
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28358,10 +28358,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_P1(<vscale x 16 x float> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28386,10 +28386,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_P1(<vscale x 16 x float> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28419,10 +28419,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_PALL(<vscale x 16 x float> %va
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28447,10 +28447,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_PALL(<vscale x 16 x float> %va
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28480,10 +28480,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_S1(<vscale x 16 x float> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28508,10 +28508,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_S1(<vscale x 16 x float> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28541,10 +28541,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_ALL(<vscale x 16 x float> %val
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28569,10 +28569,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_ALL(<vscale x 16 x float> %val
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28601,10 +28601,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_DEFAULT(<vscale x 16 x float>
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28629,10 +28629,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_DEFAULT(<vscale x 16 x float>
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -29322,12 +29322,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB850_2
@@ -29345,7 +29345,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB850_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29374,7 +29374,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -29406,12 +29406,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB850_2
@@ -29429,7 +29429,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB850_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29458,7 +29458,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -29495,12 +29495,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB851_2
@@ -29518,7 +29518,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB851_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29547,7 +29547,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -29579,12 +29579,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB851_2
@@ -29602,7 +29602,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB851_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29631,7 +29631,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -29668,12 +29668,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB852_2
@@ -29691,7 +29691,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB852_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29720,7 +29720,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -29752,12 +29752,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB852_2
@@ -29775,7 +29775,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB852_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29804,7 +29804,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -29841,12 +29841,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB853_2
@@ -29864,7 +29864,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB853_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29893,7 +29893,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -29925,12 +29925,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB853_2
@@ -29948,7 +29948,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB853_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29977,7 +29977,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -30013,12 +30013,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB854_2
@@ -30036,7 +30036,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB854_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30065,7 +30065,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -30097,12 +30097,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB854_2
@@ -30120,7 +30120,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB854_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30149,7 +30149,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -30226,13 +30226,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v17, (zero), v0
@@ -30244,10 +30244,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30277,10 +30277,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v24
@@ -30342,13 +30342,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v17, (zero), v0
@@ -30360,10 +30360,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30393,10 +30393,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v24
@@ -30463,13 +30463,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v17, (zero), v0
@@ -30481,10 +30481,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30514,10 +30514,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v24
@@ -30579,13 +30579,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v17, (zero), v0
@@ -30597,10 +30597,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30630,10 +30630,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v24
@@ -30700,13 +30700,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v17, (zero), v0
@@ -30718,10 +30718,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30751,10 +30751,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v24
@@ -30816,13 +30816,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v17, (zero), v0
@@ -30834,10 +30834,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30867,10 +30867,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v24
@@ -30937,13 +30937,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v17, (zero), v0
@@ -30955,10 +30955,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30988,10 +30988,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v24
@@ -31053,13 +31053,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v17, (zero), v0
@@ -31071,10 +31071,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -31104,10 +31104,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v24
@@ -31173,13 +31173,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v17, (zero), v0
@@ -31191,10 +31191,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -31224,10 +31224,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v24
@@ -31289,13 +31289,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v17, (zero), v0
@@ -31307,10 +31307,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -31340,10 +31340,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v24
@@ -32026,12 +32026,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB880_2
@@ -32049,7 +32049,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB880_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32078,7 +32078,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32110,12 +32110,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB880_2
@@ -32133,7 +32133,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB880_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32162,7 +32162,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32199,12 +32199,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB881_2
@@ -32222,7 +32222,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB881_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32251,7 +32251,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32283,12 +32283,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB881_2
@@ -32306,7 +32306,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB881_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32335,7 +32335,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32372,12 +32372,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB882_2
@@ -32395,7 +32395,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB882_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32424,7 +32424,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32456,12 +32456,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB882_2
@@ -32479,7 +32479,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB882_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32508,7 +32508,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32545,12 +32545,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB883_2
@@ -32568,7 +32568,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB883_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32597,7 +32597,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32629,12 +32629,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB883_2
@@ -32652,7 +32652,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB883_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32681,7 +32681,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32717,12 +32717,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB884_2
@@ -32740,7 +32740,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB884_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32769,7 +32769,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32801,12 +32801,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB884_2
@@ -32824,7 +32824,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB884_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32853,7 +32853,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32907,13 +32907,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v0
@@ -32925,10 +32925,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32955,10 +32955,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
@@ -32997,13 +32997,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33015,10 +33015,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33045,10 +33045,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33092,13 +33092,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33110,10 +33110,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33140,10 +33140,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33182,13 +33182,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33200,10 +33200,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33230,10 +33230,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33277,13 +33277,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33295,10 +33295,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33325,10 +33325,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33367,13 +33367,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33385,10 +33385,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33415,10 +33415,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33462,13 +33462,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33480,10 +33480,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33510,10 +33510,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33552,13 +33552,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33570,10 +33570,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33600,10 +33600,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33646,13 +33646,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33664,10 +33664,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33694,10 +33694,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33736,13 +33736,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33754,10 +33754,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33784,10 +33784,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
@@ -34527,30 +34527,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    mv a0, s1
 ; CHECK-RV64V-NEXT:    call __muldi3
 ; CHECK-RV64V-NEXT:    slli a2, s1, 2
-; CHECK-RV64V-NEXT:    sub a1, s0, a2
-; CHECK-RV64V-NEXT:    sltu a3, s0, a1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
-; CHECK-RV64V-NEXT:    and a3, a3, a1
 ; CHECK-RV64V-NEXT:    slli a1, s1, 1
+; CHECK-RV64V-NEXT:    sub a3, s0, a2
+; CHECK-RV64V-NEXT:    sltu a4, a2, s0
+; CHECK-RV64V-NEXT:    addi a4, a4, -1
+; CHECK-RV64V-NEXT:    and a3, a4, a3
 ; CHECK-RV64V-NEXT:    sub a4, a3, a1
-; CHECK-RV64V-NEXT:    sltu a5, a3, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a3
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
-; CHECK-RV64V-NEXT:    and a6, a5, a4
-; CHECK-RV64V-NEXT:    sub a4, a6, s1
-; CHECK-RV64V-NEXT:    mv a5, a6
-; CHECK-RV64V-NEXT:    bltu a6, s1, .LBB910_2
+; CHECK-RV64V-NEXT:    and a5, a5, a4
+; CHECK-RV64V-NEXT:    mv a4, a5
+; CHECK-RV64V-NEXT:    bltu a5, s1, .LBB910_2
 ; CHECK-RV64V-NEXT:  # %bb.1:
-; CHECK-RV64V-NEXT:    mv a5, s1
+; CHECK-RV64V-NEXT:    mv a4, s1
 ; CHECK-RV64V-NEXT:  .LBB910_2:
-; CHECK-RV64V-NEXT:    sltu a7, a6, a4
+; CHECK-RV64V-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64V-NEXT:    bltu a3, a1, .LBB910_4
 ; CHECK-RV64V-NEXT:  # %bb.3:
 ; CHECK-RV64V-NEXT:    mv a3, a1
 ; CHECK-RV64V-NEXT:  .LBB910_4:
 ; CHECK-RV64V-NEXT:    add a6, s2, a0
-; CHECK-RV64V-NEXT:    addi a0, a7, -1
+; CHECK-RV64V-NEXT:    sub a0, a5, s1
+; CHECK-RV64V-NEXT:    addi a5, a7, -1
 ; CHECK-RV64V-NEXT:    sub a7, a3, s1
-; CHECK-RV64V-NEXT:    sltu t0, a3, a7
+; CHECK-RV64V-NEXT:    sltu t0, s1, a3
 ; CHECK-RV64V-NEXT:    addi t0, t0, -1
 ; CHECK-RV64V-NEXT:    and a7, t0, a7
 ; CHECK-RV64V-NEXT:    bltu a3, s1, .LBB910_6
@@ -34560,26 +34560,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (a6)
 ; CHECK-RV64V-NEXT:    addi a6, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 3
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 3
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a7, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 4
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 4
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    and a0, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a5, a0
 ; CHECK-RV64V-NEXT:    bltu s0, a2, .LBB910_8
 ; CHECK-RV64V-NEXT:  # %bb.7:
 ; CHECK-RV64V-NEXT:    mv s0, a2
@@ -34589,11 +34589,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64V-NEXT:    sub a0, s0, a1
-; CHECK-RV64V-NEXT:    sltu a2, s0, a0
+; CHECK-RV64V-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a0, a2, a0
 ; CHECK-RV64V-NEXT:    sub a2, a0, s1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    bltu a0, s1, .LBB910_10
@@ -34619,7 +34619,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    mv s0, a1
 ; CHECK-RV64V-NEXT:  .LBB910_12:
 ; CHECK-RV64V-NEXT:    sub a0, s0, s1
-; CHECK-RV64V-NEXT:    sltu a1, s0, a0
+; CHECK-RV64V-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64V-NEXT:    addi a1, a1, -1
 ; CHECK-RV64V-NEXT:    and a0, a1, a0
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -34674,45 +34674,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV32V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT:    csrr a4, vlenb
+; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT:    slli a3, a4, 3
-; CHECK-RV32V-NEXT:    slli a1, a4, 2
-; CHECK-RV32V-NEXT:    add a0, a0, a3
-; CHECK-RV32V-NEXT:    sub a3, a2, a1
+; CHECK-RV32V-NEXT:    slli a4, a1, 3
+; CHECK-RV32V-NEXT:    slli a3, a1, 2
+; CHECK-RV32V-NEXT:    slli a1, a1, 1
+; CHECK-RV32V-NEXT:    add a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a2, a3
+; CHECK-RV32V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32V-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT:    sltu a0, a2, a3
-; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a3, a0, a3
-; CHECK-RV32V-NEXT:    slli a0, a4, 1
-; CHECK-RV32V-NEXT:    sub a4, a3, a0
-; CHECK-RV32V-NEXT:    sltu a5, a3, a4
+; CHECK-RV32V-NEXT:    addi a0, a5, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a0, a1
+; CHECK-RV32V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32V-NEXT:    addi a5, a5, -1
 ; CHECK-RV32V-NEXT:    and a4, a5, a4
-; CHECK-RV32V-NEXT:    bltu a3, a0, .LBB910_2
+; CHECK-RV32V-NEXT:    bltu a0, a1, .LBB910_2
 ; CHECK-RV32V-NEXT:  # %bb.1:
-; CHECK-RV32V-NEXT:    mv a3, a0
+; CHECK-RV32V-NEXT:    mv a0, a1
 ; CHECK-RV32V-NEXT:  .LBB910_2:
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB910_4
+; CHECK-RV32V-NEXT:    bltu a2, a3, .LBB910_4
 ; CHECK-RV32V-NEXT:  # %bb.3:
-; CHECK-RV32V-NEXT:    mv a2, a1
+; CHECK-RV32V-NEXT:    mv a2, a3
 ; CHECK-RV32V-NEXT:  .LBB910_4:
-; CHECK-RV32V-NEXT:    sub a1, a2, a0
-; CHECK-RV32V-NEXT:    sltu a3, a2, a1
+; CHECK-RV32V-NEXT:    sub a0, a2, a1
+; CHECK-RV32V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
-; CHECK-RV32V-NEXT:    and a1, a3, a1
-; CHECK-RV32V-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    and a0, a3, a0
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT:    bltu a2, a0, .LBB910_6
+; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB910_6
 ; CHECK-RV32V-NEXT:  # %bb.5:
-; CHECK-RV32V-NEXT:    mv a2, a0
+; CHECK-RV32V-NEXT:    mv a2, a1
 ; CHECK-RV32V-NEXT:  .LBB910_6:
 ; CHECK-RV32V-NEXT:    addi a0, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -34800,33 +34800,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    li a1, 40
 ; CHECK-RV64VC-NEXT:    mv a0, s1
 ; CHECK-RV64VC-NEXT:    call __muldi3
-; CHECK-RV64VC-NEXT:    slli a7, s1, 2
-; CHECK-RV64VC-NEXT:    sub a1, s0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a1
-; CHECK-RV64VC-NEXT:    addi a2, a2, -1
-; CHECK-RV64VC-NEXT:    and a3, a2, a1
+; CHECK-RV64VC-NEXT:    slli a2, s1, 2
 ; CHECK-RV64VC-NEXT:    slli a1, s1, 1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a3, s0, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a2, s0
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
-; CHECK-RV64VC-NEXT:    sub t0, a2, s1
-; CHECK-RV64VC-NEXT:    mv a5, a2
-; CHECK-RV64VC-NEXT:    bltu a2, s1, .LBB910_2
+; CHECK-RV64VC-NEXT:    and a3, a3, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a3
+; CHECK-RV64VC-NEXT:    addi a5, a5, -1
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
+; CHECK-RV64VC-NEXT:    mv t0, a5
+; CHECK-RV64VC-NEXT:    bltu a5, s1, .LBB910_2
 ; CHECK-RV64VC-NEXT:  # %bb.1:
-; CHECK-RV64VC-NEXT:    mv a5, s1
+; CHECK-RV64VC-NEXT:    mv t0, s1
 ; CHECK-RV64VC-NEXT:  .LBB910_2:
-; CHECK-RV64VC-NEXT:    sltu a6, a2, t0
+; CHECK-RV64VC-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB910_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
 ; CHECK-RV64VC-NEXT:    mv a3, a1
 ; CHECK-RV64VC-NEXT:  .LBB910_4:
 ; CHECK-RV64VC-NEXT:    add a0, a0, s2
-; CHECK-RV64VC-NEXT:    addi a6, a6, -1
-; CHECK-RV64VC-NEXT:    sub a2, a3, s1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a6, a5, s1
+; CHECK-RV64VC-NEXT:    addi a7, a7, -1
+; CHECK-RV64VC-NEXT:    sub a5, a3, s1
+; CHECK-RV64VC-NEXT:    sltu a4, s1, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
 ; CHECK-RV64VC-NEXT:    bltu a3, s1, .LBB910_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a3, s1
@@ -34834,7 +34834,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-RV64VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, t0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v14, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -34842,7 +34842,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    add a0, a0, sp
 ; CHECK-RV64VC-NEXT:    addi a0, a0, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v13, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -34853,21 +34853,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    and a0, a6, t0
-; CHECK-RV64VC-NEXT:    bltu s0, a7, .LBB910_8
+; CHECK-RV64VC-NEXT:    and a0, a7, a6
+; CHECK-RV64VC-NEXT:    bltu s0, a2, .LBB910_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv s0, a7
+; CHECK-RV64VC-NEXT:    mv s0, a2
 ; CHECK-RV64VC-NEXT:  .LBB910_8:
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64VC-NEXT:    sub a0, s0, a1
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    sub a2, a0, s1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    bltu a0, s1, .LBB910_10
@@ -34893,7 +34893,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    mv s0, a1
 ; CHECK-RV64VC-NEXT:  .LBB910_12:
 ; CHECK-RV64VC-NEXT:    sub a0, s0, s1
-; CHECK-RV64VC-NEXT:    sltu a1, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64VC-NEXT:    addi a1, a1, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -34948,45 +34948,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV32VC-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT:    csrr a4, vlenb
+; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT:    slli a3, a4, 3
-; CHECK-RV32VC-NEXT:    slli a1, a4, 2
-; CHECK-RV32VC-NEXT:    add a0, a0, a3
-; CHECK-RV32VC-NEXT:    sub a3, a2, a1
+; CHECK-RV32VC-NEXT:    slli a4, a1, 3
+; CHECK-RV32VC-NEXT:    slli a3, a1, 2
+; CHECK-RV32VC-NEXT:    slli a1, a1, 1
+; CHECK-RV32VC-NEXT:    add a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a2, a3
+; CHECK-RV32VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32VC-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT:    sltu a0, a2, a3
-; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a3, a3, a0
-; CHECK-RV32VC-NEXT:    slli a0, a4, 1
-; CHECK-RV32VC-NEXT:    sub a4, a3, a0
-; CHECK-RV32VC-NEXT:    sltu a5, a3, a4
+; CHECK-RV32VC-NEXT:    addi a0, a5, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a5
-; CHECK-RV32VC-NEXT:    bltu a3, a0, .LBB910_2
+; CHECK-RV32VC-NEXT:    bltu a0, a1, .LBB910_2
 ; CHECK-RV32VC-NEXT:  # %bb.1:
-; CHECK-RV32VC-NEXT:    mv a3, a0
+; CHECK-RV32VC-NEXT:    mv a0, a1
 ; CHECK-RV32VC-NEXT:  .LBB910_2:
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB910_4
+; CHECK-RV32VC-NEXT:    bltu a2, a3, .LBB910_4
 ; CHECK-RV32VC-NEXT:  # %bb.3:
-; CHECK-RV32VC-NEXT:    mv a2, a1
+; CHECK-RV32VC-NEXT:    mv a2, a3
 ; CHECK-RV32VC-NEXT:  .LBB910_4:
-; CHECK-RV32VC-NEXT:    sub a1, a2, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a2, a1
+; CHECK-RV32VC-NEXT:    sub a0, a2, a1
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
-; CHECK-RV32VC-NEXT:    and a1, a1, a3
-; CHECK-RV32VC-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    and a0, a0, a3
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT:    bltu a2, a0, .LBB910_6
+; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB910_6
 ; CHECK-RV32VC-NEXT:  # %bb.5:
-; CHECK-RV32VC-NEXT:    mv a2, a0
+; CHECK-RV32VC-NEXT:    mv a2, a1
 ; CHECK-RV32VC-NEXT:  .LBB910_6:
 ; CHECK-RV32VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35080,30 +35080,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64V-NEXT:    mv a0, s1
 ; CHECK-RV64V-NEXT:    call __muldi3
 ; CHECK-RV64V-NEXT:    slli a2, s1, 2
-; CHECK-RV64V-NEXT:    sub a1, s0, a2
-; CHECK-RV64V-NEXT:    sltu a3, s0, a1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
-; CHECK-RV64V-NEXT:    and a3, a3, a1
 ; CHECK-RV64V-NEXT:    slli a1, s1, 1
+; CHECK-RV64V-NEXT:    sub a3, s0, a2
+; CHECK-RV64V-NEXT:    sltu a4, a2, s0
+; CHECK-RV64V-NEXT:    addi a4, a4, -1
+; CHECK-RV64V-NEXT:    and a3, a4, a3
 ; CHECK-RV64V-NEXT:    sub a4, a3, a1
-; CHECK-RV64V-NEXT:    sltu a5, a3, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a3
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
-; CHECK-RV64V-NEXT:    and a6, a5, a4
-; CHECK-RV64V-NEXT:    sub a4, a6, s1
-; CHECK-RV64V-NEXT:    mv a5, a6
-; CHECK-RV64V-NEXT:    bltu a6, s1, .LBB911_2
+; CHECK-RV64V-NEXT:    and a5, a5, a4
+; CHECK-RV64V-NEXT:    mv a4, a5
+; CHECK-RV64V-NEXT:    bltu a5, s1, .LBB911_2
 ; CHECK-RV64V-NEXT:  # %bb.1:
-; CHECK-RV64V-NEXT:    mv a5, s1
+; CHECK-RV64V-NEXT:    mv a4, s1
 ; CHECK-RV64V-NEXT:  .LBB911_2:
-; CHECK-RV64V-NEXT:    sltu a7, a6, a4
+; CHECK-RV64V-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64V-NEXT:    bltu a3, a1, .LBB911_4
 ; CHECK-RV64V-NEXT:  # %bb.3:
 ; CHECK-RV64V-NEXT:    mv a3, a1
 ; CHECK-RV64V-NEXT:  .LBB911_4:
 ; CHECK-RV64V-NEXT:    add a6, s2, a0
-; CHECK-RV64V-NEXT:    addi a0, a7, -1
+; CHECK-RV64V-NEXT:    sub a0, a5, s1
+; CHECK-RV64V-NEXT:    addi a5, a7, -1
 ; CHECK-RV64V-NEXT:    sub a7, a3, s1
-; CHECK-RV64V-NEXT:    sltu t0, a3, a7
+; CHECK-RV64V-NEXT:    sltu t0, s1, a3
 ; CHECK-RV64V-NEXT:    addi t0, t0, -1
 ; CHECK-RV64V-NEXT:    and a7, t0, a7
 ; CHECK-RV64V-NEXT:    bltu a3, s1, .LBB911_6
@@ -35113,26 +35113,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (a6)
 ; CHECK-RV64V-NEXT:    addi a6, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 3
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 3
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a7, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 4
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 4
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    and a0, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a5, a0
 ; CHECK-RV64V-NEXT:    bltu s0, a2, .LBB911_8
 ; CHECK-RV64V-NEXT:  # %bb.7:
 ; CHECK-RV64V-NEXT:    mv s0, a2
@@ -35142,11 +35142,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64V-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64V-NEXT:    sub a0, s0, a1
-; CHECK-RV64V-NEXT:    sltu a2, s0, a0
+; CHECK-RV64V-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a0, a2, a0
 ; CHECK-RV64V-NEXT:    sub a2, a0, s1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    bltu a0, s1, .LBB911_10
@@ -35172,7 +35172,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64V-NEXT:    mv s0, a1
 ; CHECK-RV64V-NEXT:  .LBB911_12:
 ; CHECK-RV64V-NEXT:    sub a0, s0, s1
-; CHECK-RV64V-NEXT:    sltu a1, s0, a0
+; CHECK-RV64V-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64V-NEXT:    addi a1, a1, -1
 ; CHECK-RV64V-NEXT:    and a0, a1, a0
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -35227,45 +35227,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV32V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT:    csrr a4, vlenb
+; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT:    slli a3, a4, 3
-; CHECK-RV32V-NEXT:    slli a1, a4, 2
-; CHECK-RV32V-NEXT:    add a0, a0, a3
-; CHECK-RV32V-NEXT:    sub a3, a2, a1
+; CHECK-RV32V-NEXT:    slli a4, a1, 3
+; CHECK-RV32V-NEXT:    slli a3, a1, 2
+; CHECK-RV32V-NEXT:    slli a1, a1, 1
+; CHECK-RV32V-NEXT:    add a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a2, a3
+; CHECK-RV32V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32V-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT:    sltu a0, a2, a3
-; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a3, a0, a3
-; CHECK-RV32V-NEXT:    slli a0, a4, 1
-; CHECK-RV32V-NEXT:    sub a4, a3, a0
-; CHECK-RV32V-NEXT:    sltu a5, a3, a4
+; CHECK-RV32V-NEXT:    addi a0, a5, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a0, a1
+; CHECK-RV32V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32V-NEXT:    addi a5, a5, -1
 ; CHECK-RV32V-NEXT:    and a4, a5, a4
-; CHECK-RV32V-NEXT:    bltu a3, a0, .LBB911_2
+; CHECK-RV32V-NEXT:    bltu a0, a1, .LBB911_2
 ; CHECK-RV32V-NEXT:  # %bb.1:
-; CHECK-RV32V-NEXT:    mv a3, a0
+; CHECK-RV32V-NEXT:    mv a0, a1
 ; CHECK-RV32V-NEXT:  .LBB911_2:
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB911_4
+; CHECK-RV32V-NEXT:    bltu a2, a3, .LBB911_4
 ; CHECK-RV32V-NEXT:  # %bb.3:
-; CHECK-RV32V-NEXT:    mv a2, a1
+; CHECK-RV32V-NEXT:    mv a2, a3
 ; CHECK-RV32V-NEXT:  .LBB911_4:
-; CHECK-RV32V-NEXT:    sub a1, a2, a0
-; CHECK-RV32V-NEXT:    sltu a3, a2, a1
+; CHECK-RV32V-NEXT:    sub a0, a2, a1
+; CHECK-RV32V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
-; CHECK-RV32V-NEXT:    and a1, a3, a1
-; CHECK-RV32V-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    and a0, a3, a0
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT:    bltu a2, a0, .LBB911_6
+; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB911_6
 ; CHECK-RV32V-NEXT:  # %bb.5:
-; CHECK-RV32V-NEXT:    mv a2, a0
+; CHECK-RV32V-NEXT:    mv a2, a1
 ; CHECK-RV32V-NEXT:  .LBB911_6:
 ; CHECK-RV32V-NEXT:    addi a0, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35353,33 +35353,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64VC-NEXT:    li a1, 40
 ; CHECK-RV64VC-NEXT:    mv a0, s1
 ; CHECK-RV64VC-NEXT:    call __muldi3
-; CHECK-RV64VC-NEXT:    slli a7, s1, 2
-; CHECK-RV64VC-NEXT:    sub a1, s0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a1
-; CHECK-RV64VC-NEXT:    addi a2, a2, -1
-; CHECK-RV64VC-NEXT:    and a3, a2, a1
+; CHECK-RV64VC-NEXT:    slli a2, s1, 2
 ; CHECK-RV64VC-NEXT:    slli a1, s1, 1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a3, s0, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a2, s0
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
-; CHECK-RV64VC-NEXT:    sub t0, a2, s1
-; CHECK-RV64VC-NEXT:    mv a5, a2
-; CHECK-RV64VC-NEXT:    bltu a2, s1, .LBB911_2
+; CHECK-RV64VC-NEXT:    and a3, a3, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a3
+; CHECK-RV64VC-NEXT:    addi a5, a5, -1
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
+; CHECK-RV64VC-NEXT:    mv t0, a5
+; CHECK-RV64VC-NEXT:    bltu a5, s1, .LBB911_2
 ; CHECK-RV64VC-NEXT:  # %bb.1:
-; CHECK-RV64VC-NEXT:    mv a5, s1
+; CHECK-RV64VC-NEXT:    mv t0, s1
 ; CHECK-RV64VC-NEXT:  .LBB911_2:
-; CHECK-RV64VC-NEXT:    sltu a6, a2, t0
+; CHECK-RV64VC-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB911_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
 ; CHECK-RV64VC-NEXT:    mv a3, a1
 ; CHECK-RV64VC-NEXT:  .LBB911_4:
 ; CHECK-RV64VC-NEXT:    add a0, a0, s2
-; CHECK-RV64VC-NEXT:    addi a6, a6, -1
-; CHECK-RV64VC-NEXT:    sub a2, a3, s1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a6, a5, s1
+; CHECK-RV64VC-NEXT:    addi a7, a7, -1
+; CHECK-RV64VC-NEXT:    sub a5, a3, s1
+; CHECK-RV64VC-NEXT:    sltu a4, s1, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
 ; CHECK-RV64VC-NEXT:    bltu a3, s1, .LBB911_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a3, s1
@@ -35387,7 +35387,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-RV64VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, t0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vluxei64.v v14, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -35395,7 +35395,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64VC-NEXT:    add a0, a0, sp
 ; CHECK-RV64VC-NEXT:    addi a0, a0, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vluxei64.v v13, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -35406,21 +35406,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    and a0, a6, t0
-; CHECK-RV64VC-NEXT:    bltu s0, a7, .LBB911_8
+; CHECK-RV64VC-NEXT:    and a0, a7, a6
+; CHECK-RV64VC-NEXT:    bltu s0, a2, .LBB911_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv s0, a7
+; CHECK-RV64VC-NEXT:    mv s0, a2
 ; CHECK-RV64VC-NEXT:  .LBB911_8:
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64VC-NEXT:    sub a0, s0, a1
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    sub a2, a0, s1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    bltu a0, s1, .LBB911_10
@@ -35446,7 +35446,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64VC-NEXT:    mv s0, a1
 ; CHECK-RV64VC-NEXT:  .LBB911_12:
 ; CHECK-RV64VC-NEXT:    sub a0, s0, s1
-; CHECK-RV64VC-NEXT:    sltu a1, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64VC-NEXT:    addi a1, a1, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -35501,45 +35501,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV32VC-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT:    csrr a4, vlenb
+; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT:    slli a3, a4, 3
-; CHECK-RV32VC-NEXT:    slli a1, a4, 2
-; CHECK-RV32VC-NEXT:    add a0, a0, a3
-; CHECK-RV32VC-NEXT:    sub a3, a2, a1
+; CHECK-RV32VC-NEXT:    slli a4, a1, 3
+; CHECK-RV32VC-NEXT:    slli a3, a1, 2
+; CHECK-RV32VC-NEXT:    slli a1, a1, 1
+; CHECK-RV32VC-NEXT:    add a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a2, a3
+; CHECK-RV32VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32VC-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT:    sltu a0, a2, a3
-; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a3, a3, a0
-; CHECK-RV32VC-NEXT:    slli a0, a4, 1
-; CHECK-RV32VC-NEXT:    sub a4, a3, a0
-; CHECK-RV32VC-NEXT:    sltu a5, a3, a4
+; CHECK-RV32VC-NEXT:    addi a0, a5, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a5
-; CHECK-RV32VC-NEXT:    bltu a3, a0, .LBB911_2
+; CHECK-RV32VC-NEXT:    bltu a0, a1, .LBB911_2
 ; CHECK-RV32VC-NEXT:  # %bb.1:
-; CHECK-RV32VC-NEXT:    mv a3, a0
+; CHECK-RV32VC-NEXT:    mv a0, a1
 ; CHECK-RV32VC-NEXT:  .LBB911_2:
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB911_4
+; CHECK-RV32VC-NEXT:    bltu a2, a3, .LBB911_4
 ; CHECK-RV32VC-NEXT:  # %bb.3:
-; CHECK-RV32VC-NEXT:    mv a2, a1
+; CHECK-RV32VC-NEXT:    mv a2, a3
 ; CHECK-RV32VC-NEXT:  .LBB911_4:
-; CHECK-RV32VC-NEXT:    sub a1, a2, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a2, a1
+; CHECK-RV32VC-NEXT:    sub a0, a2, a1
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
-; CHECK-RV32VC-NEXT:    and a1, a1, a3
-; CHECK-RV32VC-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    and a0, a0, a3
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT:    bltu a2, a0, .LBB911_6
+; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB911_6
 ; CHECK-RV32VC-NEXT:  # %bb.5:
-; CHECK-RV32VC-NEXT:    mv a2, a0
+; CHECK-RV32VC-NEXT:    mv a2, a1
 ; CHECK-RV32VC-NEXT:  .LBB911_6:
 ; CHECK-RV32VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35633,30 +35633,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    mv a0, s1
 ; CHECK-RV64V-NEXT:    call __muldi3
 ; CHECK-RV64V-NEXT:    slli a2, s1, 2
-; CHECK-RV64V-NEXT:    sub a1, s0, a2
-; CHECK-RV64V-NEXT:    sltu a3, s0, a1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
-; CHECK-RV64V-NEXT:    and a3, a3, a1
 ; CHECK-RV64V-NEXT:    slli a1, s1, 1
+; CHECK-RV64V-NEXT:    sub a3, s0, a2
+; CHECK-RV64V-NEXT:    sltu a4, a2, s0
+; CHECK-RV64V-NEXT:    addi a4, a4, -1
+; CHECK-RV64V-NEXT:    and a3, a4, a3
 ; CHECK-RV64V-NEXT:    sub a4, a3, a1
-; CHECK-RV64V-NEXT:    sltu a5, a3, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a3
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
-; CHECK-RV64V-NEXT:    and a6, a5, a4
-; CHECK-RV64V-NEXT:    sub a4, a6, s1
-; CHECK-RV64V-NEXT:    mv a5, a6
-; CHECK-RV64V-NEXT:    bltu a6, s1, .LBB912_2
+; CHECK-RV64V-NEXT:    and a5, a5, a4
+; CHECK-RV64V-NEXT:    mv a4, a5
+; CHECK-RV64V-NEXT:    bltu a5, s1, .LBB912_2
 ; CHECK-RV64V-NEXT:  # %bb.1:
-; CHECK-RV64V-NEXT:    mv a5, s1
+; CHECK-RV64V-NEXT:    mv a4, s1
 ; CHECK-RV64V-NEXT:  .LBB912_2:
-; CHECK-RV64V-NEXT:    sltu a7, a6, a4
+; CHECK-RV64V-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64V-NEXT:    bltu a3, a1, .LBB912_4
 ; CHECK-RV64V-NEXT:  # %bb.3:
 ; CHECK-RV64V-NEXT:    mv a3, a1
 ; CHECK-RV64V-NEXT:  .LBB912_4:
 ; CHECK-RV64V-NEXT:    add a6, s2, a0
-; CHECK-RV64V-NEXT:    addi a0, a7, -1
+; CHECK-RV64V-NEXT:    sub a0, a5, s1
+; CHECK-RV64V-NEXT:    addi a5, a7, -1
 ; CHECK-RV64V-NEXT:    sub a7, a3, s1
-; CHECK-RV64V-NEXT:    sltu t0, a3, a7
+; CHECK-RV64V-NEXT:    sltu t0, s1, a3
 ; CHECK-RV64V-NEXT:    addi t0, t0, -1
 ; CHECK-RV64V-NEXT:    and a7, t0, a7
 ; CHECK-RV64V-NEXT:    bltu a3, s1, .LBB912_6
@@ -35666,26 +35666,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (a6)
 ; CHECK-RV64V-NEXT:    addi a6, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 3
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 3
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a7, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 4
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 4
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    and a0, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a5, a0
 ; CHECK-RV64V-NEXT:    bltu s0, a2, .LBB912_8
 ; CHECK-RV64V-NEXT:  # %bb.7:
 ; CHECK-RV64V-NEXT:    mv s0, a2
@@ -35695,11 +35695,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64V-NEXT:    sub a0, s0, a1
-; CHECK-RV64V-NEXT:    sltu a2, s0, a0
+; CHECK-RV64V-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a0, a2, a0
 ; CHECK-RV64V-NEXT:    sub a2, a0, s1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    bltu a0, s1, .LBB912_10
@@ -35725,7 +35725,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    mv s0, a1
 ; CHECK-RV64V-NEXT:  .LBB912_12:
 ; CHECK-RV64V-NEXT:    sub a0, s0, s1
-; CHECK-RV64V-NEXT:    sltu a1, s0, a0
+; CHECK-RV64V-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64V-NEXT:    addi a1, a1, -1
 ; CHECK-RV64V-NEXT:    and a0, a1, a0
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -35780,45 +35780,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV32V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT:    csrr a4, vlenb
+; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT:    slli a3, a4, 3
-; CHECK-RV32V-NEXT:    slli a1, a4, 2
-; CHECK-RV32V-NEXT:    add a0, a0, a3
-; CHECK-RV32V-NEXT:    sub a3, a2, a1
+; CHECK-RV32V-NEXT:    slli a4, a1, 3
+; CHECK-RV32V-NEXT:    slli a3, a1, 2
+; CHECK-RV32V-NEXT:    slli a1, a1, 1
+; CHECK-RV32V-NEXT:    add a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a2, a3
+; CHECK-RV32V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32V-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT:    sltu a0, a2, a3
-; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a3, a0, a3
-; CHECK-RV32V-NEXT:    slli a0, a4, 1
-; CHECK-RV32V-NEXT:    sub a4, a3, a0
-; CHECK-RV32V-NEXT:    sltu a5, a3, a4
+; CHECK-RV32V-NEXT:    addi a0, a5, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a0, a1
+; CHECK-RV32V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32V-NEXT:    addi a5, a5, -1
 ; CHECK-RV32V-NEXT:    and a4, a5, a4
-; CHECK-RV32V-NEXT:    bltu a3, a0, .LBB912_2
+; CHECK-RV32V-NEXT:    bltu a0, a1, .LBB912_2
 ; CHECK-RV32V-NEXT:  # %bb.1:
-; CHECK-RV32V-NEXT:    mv a3, a0
+; CHECK-RV32V-NEXT:    mv a0, a1
 ; CHECK-RV32V-NEXT:  .LBB912_2:
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB912_4
+; CHECK-RV32V-NEXT:    bltu a2, a3, .LBB912_4
 ; CHECK-RV32V-NEXT:  # %bb.3:
-; CHECK-RV32V-NEXT:    mv a2, a1
+; CHECK-RV32V-NEXT:    mv a2, a3
 ; CHECK-RV32V-NEXT:  .LBB912_4:
-; CHECK-RV32V-NEXT:    sub a1, a2, a0
-; CHECK-RV32V-NEXT:    sltu a3, a2, a1
+; CHECK-RV32V-NEXT:    sub a0, a2, a1
+; CHECK-RV32V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
-; CHECK-RV32V-NEXT:    and a1, a3, a1
-; CHECK-RV32V-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    and a0, a3, a0
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT:    bltu a2, a0, .LBB912_6
+; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB912_6
 ; CHECK-RV32V-NEXT:  # %bb.5:
-; CHECK-RV32V-NEXT:    mv a2, a0
+; CHECK-RV32V-NEXT:    mv a2, a1
 ; CHECK-RV32V-NEXT:  .LBB912_6:
 ; CHECK-RV32V-NEXT:    addi a0, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35906,33 +35906,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    li a1, 40
 ; CHECK-RV64VC-NEXT:    mv a0, s1
 ; CHECK-RV64VC-NEXT:    call __muldi3
-; CHECK-RV64VC-NEXT:    slli a7, s1, 2
-; CHECK-RV64VC-NEXT:    sub a1, s0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a1
-; CHECK-RV64VC-NEXT:    addi a2, a2, -1
-; CHECK-RV64VC-NEXT:    and a3, a2, a1
+; CHECK-RV64VC-NEXT:    slli a2, s1, 2
 ; CHECK-RV64VC-NEXT:    slli a1, s1, 1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a3, s0, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a2, s0
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
-; CHECK-RV64VC-NEXT:    sub t0, a2, s1
-; CHECK-RV64VC-NEXT:    mv a5, a2
-; CHECK-RV64VC-NEXT:    bltu a2, s1, .LBB912_2
+; CHECK-RV64VC-NEXT:    and a3, a3, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a3
+; CHECK-RV64VC-NEXT:    addi a5, a5, -1
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
+; CHECK-RV64VC-NEXT:    mv t0, a5
+; CHECK-RV64VC-NEXT:    bltu a5, s1, .LBB912_2
 ; CHECK-RV64VC-NEXT:  # %bb.1:
-; CHECK-RV64VC-NEXT:    mv a5, s1
+; CHECK-RV64VC-NEXT:    mv t0, s1
 ; CHECK-RV64VC-NEXT:  .LBB912_2:
-; CHECK-RV64VC-NEXT:    sltu a6, a2, t0
+; CHECK-RV64VC-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB912_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
 ; CHECK-RV64VC-NEXT:    mv a3, a1
 ; CHECK-RV64VC-NEXT:  .LBB912_4:
 ; CHECK-RV64VC-NEXT:    add a0, a0, s2
-; CHECK-RV64VC-NEXT:    addi a6, a6, -1
-; CHECK-RV64VC-NEXT:    sub a2, a3, s1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a6, a5, s1
+; CHECK-RV64VC-NEXT:    addi a7, a7, -1
+; CHECK-RV64VC-NEXT:    sub a5, a3, s1
+; CHECK-RV64VC-NEXT:    sltu a4, s1, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
 ; CHECK-RV64VC-NEXT:    bltu a3, s1, .LBB912_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a3, s1
@@ -35940,7 +35940,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-RV64VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, t0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v14, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -35948,7 +35948,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    add a0, a0, sp
 ; CHECK-RV64VC-NEXT:    addi a0, a0, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v13, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -35959,21 +35959,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    and a0, a6, t0
-; CHECK-RV64VC-NEXT:    bltu s0, a7, .LBB912_8
+; CHECK-RV64VC-NEXT:    and a0, a7, a6
+; CHECK-RV64VC-NEXT:    bltu s0, a2, .LBB912_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv s0, a7
+; CHECK-RV64VC-NEXT:    mv s0, a2
 ; CHECK-RV64VC-NEXT:  .LBB912_8:
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64VC-NEXT:    sub a0, s0, a1
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    sub a2, a0, s1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    bltu a0, s1, .LBB912_10
@@ -35999,7 +35999,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    mv s0, a1
 ; CHECK-RV64VC-NEXT:  .LBB912_12:
 ; CHECK-RV64VC-NEXT:    sub a0, s0, s1
-; CHECK-RV64VC-NEXT:    sltu a1, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64VC-NEXT:    addi a1, a1, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -36054,45 +36054,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV32VC-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT:    csrr a4, vlenb
+; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT:    slli a3, a4, 3
-; CHECK-RV32VC-NEXT:    slli a1, a4, 2
-; CHECK-RV32VC-NEXT:    add a0, a0, a3
-; CHECK-RV32VC-NEXT:    sub a3, a2, a1
+; CHECK-RV32VC-NEXT:    slli a4, a1, 3
+; CHECK-RV32VC-NEXT:    slli a3, a1, 2
+; CHECK-RV32VC-NEXT:    slli a1, a1, 1
+; CHECK-RV32VC-NEXT:    add a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a2, a3
+; CHECK-RV32VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32VC-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT:    sltu a0, a2, a3
-; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a3, a3, a0
-; CHECK-RV32VC-NEXT:    slli a0, a4, 1
-; CHECK-RV32VC-NEXT:    sub a4, a3, a0
-; CHECK-RV32VC-NEXT:    sltu a5, a3, a4
+; CHECK-RV32VC-NEXT:    addi a0, a5, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a5
-; CHECK-RV32VC-NEXT:    bltu a3, a0, .LBB912_2
+; CHECK-RV32VC-NEXT:    bltu a0, a1, .LBB912_2
 ; CHECK-RV32VC-NEXT:  # %bb.1:
-; CHECK-RV32VC-NEXT:    mv a3, a0
+; CHECK-RV32VC-NEXT:    mv a0, a1
 ; CHECK-RV32VC-NEXT:  .LBB912_2:
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB912_4
+; CHECK-RV32VC-NEXT:    bltu a2, a3, .LBB912_4
 ; CHECK-RV32VC-NEXT:  # %bb.3:
-; CHECK-RV32VC-NEXT:    mv a2, a1
+; CHECK-RV32VC-NEXT:    mv a2, a3
 ; CHECK-RV32VC-NEXT:  .LBB912_4:
-; CHECK-RV32VC-NEXT:    sub a1, a2, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a2, a1
+; CHECK-RV32VC-NEXT:    sub a0, a2, a1
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
-; CHECK-RV32VC-NEXT:    and a1, a1, a3
-; CHECK-RV32VC-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    and a0, a0, a3
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT:    bltu a2, a0, .LBB912_6
+; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB912_6
 ; CHECK-RV32VC-NEXT:  # %bb.5:
-; CHECK-RV32VC-NEXT:    mv a2, a0
+; CHECK-RV32VC-NEXT:    mv a2, a1
 ; CHECK-RV32VC-NEXT:  .LBB912_6:
 ; CHECK-RV32VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -36186,30 +36186,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    mv a0, s1
 ; CHECK-RV64V-NEXT:    call __muldi3
 ; CHECK-RV64V-NEXT:    slli a2, s1, 2
-; CHECK-RV64V-NEXT:    sub a1, s0, a2
-; CHECK-RV64V-NEXT:    sltu a3, s0, a1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
-; CHECK-RV64V-NEXT:    and a3, a3, a1
 ; CHECK-RV64V-NEXT:    slli a1, s1, 1
+; CHECK-RV64V-NEXT:    sub a3, s0, a2
+; CHECK-RV64V-NEXT:    sltu a4, a2, s0
+; CHECK-RV64V-NEXT:    addi a4, a4, -1
+; CHECK-RV64V-NEXT:    and a3, a4, a3
 ; CHECK-RV64V-NEXT:    sub a4, a3, a1
-; CHECK-RV64V-NEXT:    sltu a5, a3, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a3
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
-; CHECK-RV64V-NEXT:    and a6, a5, a4
-; CHECK-RV64V-NEXT:    sub a4, a6, s1
-; CHECK-RV64V-NEXT:    mv a5, a6
-; CHECK-RV64V-NEXT:    bltu a6, s1, .LBB913_2
+; CHECK-RV64V-NEXT:    and a5, a5, a4
+; CHECK-RV64V-NEXT:    mv a4, a5
+; CHECK-RV64V-NEXT:    bltu a5, s1, .LBB913_2
 ; CHECK-RV64V-NEXT:  # %bb.1:
-; CHECK-RV64V-NEXT:    mv a5, s1
+; CHECK-RV64V-NEXT:    mv a4, s1
 ; CHECK-RV64V-NEXT:  .LBB913_2:
-; CHECK-RV64V-NEXT:    sltu a7, a6, a4
+; CHECK-RV64V-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64V-NEXT:    bltu a3, a1, .LBB913_4
 ; CHECK-RV64V-NEXT:  # %bb.3:
 ; CHECK-RV64V-NEXT:    mv a3, a1
 ; CHECK-RV64V-NEXT:  .LBB913_4:
 ; CHECK-RV64V-NEXT:    add a6, s2, a0
-; CHECK-RV64V-NEXT:    addi a0, a7, -1
+; CHECK-RV64V-NEXT:    sub a0, a5, s1
+; CHECK-RV64V-NEXT:    addi a5, a7, -1
 ; CHECK-RV64V-NEXT:    sub a7, a3, s1
-; CHECK-RV64V-NEXT:    sltu t0, a3, a7
+; CHECK-RV64V-NEXT:    sltu t0, s1, a3
 ; CHECK-RV64V-NEXT:    addi t0, t0, -1
 ; CHECK-RV64V-NEXT:    and a7, t0, a7
 ; CHECK-RV64V-NEXT:    bltu a3, s1, .LBB913_6
@@ -36219,26 +36219,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (a6)
 ; CHECK-RV64V-NEXT:    addi a6, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 3
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 3
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a7, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 4
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 4
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    and a0, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a5, a0
 ; CHECK-RV64V-NEXT:    bltu s0, a2, .LBB913_8
 ; CHECK-RV64V-NEXT:  # %bb.7:
 ; CHECK-RV64V-NEXT:    mv s0, a2
@@ -36248,11 +36248,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64V-NEXT:    sub a0, s0, a1
-; CHECK-RV64V-NEXT:    sltu a2, s0, a0
+; CHECK-RV64V-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a0, a2, a0
 ; CHECK-RV64V-NEXT:    sub a2, a0, s1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    bltu a0, s1, .LBB913_10
@@ -36278,7 +36278,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    mv s0, a1
 ; CHECK-RV64V-NEXT:  .LBB913_12:
 ; CHECK-RV64V-NEXT:    sub a0, s0, s1
-; CHECK-RV64V-NEXT:    sltu a1, s0, a0
+; CHECK-RV64V-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64V-NEXT:    addi a1, a1, -1
 ; CHECK-RV64V-NEXT:    and a0, a1, a0
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -36333,45 +36333,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV32V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT:    csrr a4, vlenb
+; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT:    slli a3, a4, 3
-; CHECK-RV32V-NEXT:    slli a1, a4, 2
-; CHECK-RV32V-NEXT:    add a0, a0, a3
-; CHECK-RV32V-NEXT:    sub a3, a2, a1
+; CHECK-RV32V-NEXT:    slli a4, a1, 3
+; CHECK-RV32V-NEXT:    slli a3, a1, 2
+; CHECK-RV32V-NEXT:    slli a1, a1, 1
+; CHECK-RV32V-NEXT:    add a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a2, a3
+; CHECK-RV32V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32V-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT:    sltu a0, a2, a3
-; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a3, a0, a3
-; CHECK-RV32V-NEXT:    slli a0, a4, 1
-; CHECK-RV32V-NEXT:    sub a4, a3, a0
-; CHECK-RV32V-NEXT:    sltu a5, a3, a4
+; CHECK-RV32V-NEXT:    addi a0, a5, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a0, a1
+; CHECK-RV32V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32V-NEXT:    addi a5, a5, -1
 ; CHECK-RV32V-NEXT:    and a4, a5, a4
-; CHECK-RV32V-NEXT:    bltu a3, a0, .LBB913_2
+; CHECK-RV32V-NEXT:    bltu a0, a1, .LBB913_2
 ; CHECK-RV32V-NEXT:  # %bb.1:
-; CHECK-RV32V-NEXT:    mv a3, a0
+; CHECK-RV32V-NEXT:    mv a0, a1
 ; CHECK-RV32V-NEXT:  .LBB913_2:
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB913_4
+; CHECK-RV32V-NEXT:    bltu a2, a3, .LBB913_4
 ; CHECK-RV32V-NEXT:  # %bb.3:
-; CHECK-RV32V-NEXT:    mv a2, a1
+; CHECK-RV32V-NEXT:    mv a2, a3
 ; CHECK-RV32V-NEXT:  .LBB913_4:
-; CHECK-RV32V-NEXT:    sub a1, a2, a0
-; CHECK-RV32V-NEXT:    sltu a3, a2, a1
+; CHECK-RV32V-NEXT:    sub a0, a2, a1
+; CHECK-RV32V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
-; CHECK-RV32V-NEXT:    and a1, a3, a1
-; CHECK-RV32V-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    and a0, a3, a0
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT:    bltu a2, a0, .LBB913_6
+; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB913_6
 ; CHECK-RV32V-NEXT:  # %bb.5:
-; CHECK-RV32V-NEXT:    mv a2, a0
+; CHECK-RV32V-NEXT:    mv a2, a1
 ; CHECK-RV32V-NEXT:  .LBB913_6:
 ; CHECK-RV32V-NEXT:    addi a0, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -36459,33 +36459,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    li a1, 40
 ; CHECK-RV64VC-NEXT:    mv a0, s1
 ; CHECK-RV64VC-NEXT:    call __muldi3
-; CHECK-RV64VC-NEXT:    slli a7, s1, 2
-; CHECK-RV64VC-NEXT:    sub a1, s0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a1
-; CHECK-RV64VC-NEXT:    addi a2, a2, -1
-; CHECK-RV64VC-NEXT:    and a3, a2, a1
+; CHECK-RV64VC-NEXT:    slli a2, s1, 2
 ; CHECK-RV64VC-NEXT:    slli a1, s1, 1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a3, s0, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a2, s0
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
-; CHECK-RV64VC-NEXT:    sub t0, a2, s1
-; CHECK-RV64VC-NEXT:    mv a5, a2
-; CHECK-RV64VC-NEXT:    bltu a2, s1, .LBB913_2
+; CHECK-RV64VC-NEXT:    and a3, a3, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a3
+; CHECK-RV64VC-NEXT:    addi a5, a5, -1
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
+; CHECK-RV64VC-NEXT:    mv t0, a5
+; CHECK-RV64VC-NEXT:    bltu a5, s1, .LBB913_2
 ; CHECK-RV64VC-NEXT:  # %bb.1:
-; CHECK-RV64VC-NEXT:    mv a5, s1
+; CHECK-RV64VC-NEXT:    mv t0, s1
 ; CHECK-RV64VC-NEXT:  .LBB913_2:
-; CHECK-RV64VC-NEXT:    sltu a6, a2, t0
+; CHECK-RV64VC-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB913_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
 ; CHECK-RV64VC-NEXT:    mv a3, a1
 ; CHECK-RV64VC-NEXT:  .LBB913_4:
 ; CHECK-RV64VC-NEXT:    add a0, a0, s2
-; CHECK-RV64VC-NEXT:    addi a6, a6, -1
-; CHECK-RV64VC-NEXT:    sub a2, a3, s1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a6, a5, s1
+; CHECK-RV64VC-NEXT:    addi a7, a7, -1
+; CHECK-RV64VC-NEXT:    sub a5, a3, s1
+; CHECK-RV64VC-NEXT:    sltu a4, s1, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
 ; CHECK-RV64VC-NEXT:    bltu a3, s1, .LBB913_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a3, s1
@@ -36493,7 +36493,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-RV64VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, t0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v14, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -36501,7 +36501,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    add a0, a0, sp
 ; CHECK-RV64VC-NEXT:    addi a0, a0, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v13, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -36512,21 +36512,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    and a0, a6, t0
-; CHECK-RV64VC-NEXT:    bltu s0, a7, .LBB913_8
+; CHECK-RV64VC-NEXT:    and a0, a7, a6
+; CHECK-RV64VC-NEXT:    bltu s0, a2, .LBB913_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv s0, a7
+; CHECK-RV64VC-NEXT:    mv s0, a2
 ; CHECK-RV64VC-NEXT:  .LBB913_8:
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64VC-NEXT:    sub a0, s0, a1
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    sub a2, a0, s1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    bltu a0, s1, .LBB913_10
@@ -36552,7 +36552,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    mv s0, a1
 ; CHECK-RV64VC-NEXT:  .LBB913_12:
 ; CHECK-RV64VC-NEXT:    sub a0, s0, s1
-; CHECK-RV64VC-NEXT:    sltu a1, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64VC-NEXT:    addi a1, a1, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -36607,45 +36607,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV32VC-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT:    csrr a4, vlenb
+; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT:    slli a3, a4, 3
-; CHECK-RV32VC-NEXT:    slli a1, a4, 2
-; CHECK-RV32VC-NEXT:    add a0, a0, a3
-; CHECK-RV32VC-NEXT:    sub a3, a2, a1
+; CHECK-RV32VC-NEXT:    slli a4, a1, 3
+; CHECK-RV32VC-NEXT:    slli a3, a1, 2
+; CHECK-RV32VC-NEXT:    slli a1, a1, 1
+; CHECK-RV32VC-NEXT:    add a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a2, a3
+; CHECK-RV32VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32VC-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT:    sltu a0, a2, a3
-; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a3, a3, a0
-; CHECK-RV32VC-NEXT:    slli a0, a4, 1
-; CHECK-RV32VC-NEXT:    sub a4, a3, a0
-; CHECK-RV32VC-NEXT:    sltu a5, a3, a4
+; CHECK-RV32VC-NEXT:    addi a0, a5, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a5
-; CHECK-RV32VC-NEXT:    bltu a3, a0, .LBB913_2
+; CHECK-RV32VC-NEXT:    bltu a0, a1, .LBB913_2
 ; CHECK-RV32VC-NEXT:  # %bb.1:
-; CHECK-RV32VC-NEXT:    mv a3, a0
+; CHECK-RV32VC-NEXT:    mv a0, a1
 ; CHECK-RV32VC-NEXT:  .LBB913_2:
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB913_4
+; CHECK-RV32VC-NEXT:    bltu a2, a3, .LBB913_4
 ; CHECK-RV32VC-NEXT:  # %bb.3:
-; CHECK-RV32VC-NEXT:    mv a2, a1
+; CHECK-RV32VC-NEXT:    mv a2, a3
 ; CHECK-RV32VC-NEXT:  .LBB913_4:
-; CHECK-RV32VC-NEXT:    sub a1, a2, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a2, a1
+; CHECK-RV32VC-NEXT:    sub a0, a2, a1
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
-; CHECK-RV32VC-NEXT:    and a1, a1, a3
-; CHECK-RV32VC-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    and a0, a0, a3
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT:    bltu a2, a0, .LBB913_6
+; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB913_6
 ; CHECK-RV32VC-NEXT:  # %bb.5:
-; CHECK-RV32VC-NEXT:    mv a2, a0
+; CHECK-RV32VC-NEXT:    mv a2, a1
 ; CHECK-RV32VC-NEXT:  .LBB913_6:
 ; CHECK-RV32VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -36738,30 +36738,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64V-NEXT:    mv a0, s1
 ; CHECK-RV64V-NEXT:    call __muldi3
 ; CHECK-RV64V-NEXT:    slli a2, s1, 2
-; CHECK-RV64V-NEXT:    sub a1, s0, a2
-; CHECK-RV64V-NEXT:    sltu a3, s0, a1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
-; CHECK-RV64V-NEXT:    and a3, a3, a1
 ; CHECK-RV64V-NEXT:    slli a1, s1, 1
+; CHECK-RV64V-NEXT:    sub a3, s0, a2
+; CHECK-RV64V-NEXT:    sltu a4, a2, s0
+; CHECK-RV64V-NEXT:    addi a4, a4, -1
+; CHECK-RV64V-NEXT:    and a3, a4, a3
 ; CHECK-RV64V-NEXT:    sub a4, a3, a1
-; CHECK-RV64V-NEXT:    sltu a5, a3, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a3
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
-; CHECK-RV64V-NEXT:    and a6, a5, a4
-; CHECK-RV64V-NEXT:    sub a4, a6, s1
-; CHECK-RV64V-NEXT:    mv a5, a6
-; CHECK-RV64V-NEXT:    bltu a6, s1, .LBB914_2
+; CHECK-RV64V-NEXT:    and a5, a5, a4
+; CHECK-RV64V-NEXT:    mv a4, a5
+; CHECK-RV64V-NEXT:    bltu a5, s1, .LBB914_2
 ; CHECK-RV64V-NEXT:  # %bb.1:
-; CHECK-RV64V-NEXT:    mv a5, s1
+; CHECK-RV64V-NEXT:    mv a4, s1
 ; CHECK-RV64V-NEXT:  .LBB914_2:
-; CHECK-RV64V-NEXT:    sltu a7, a6, a4
+; CHECK-RV64V-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64V-NEXT:    bltu a3, a1, .LBB914_4
 ; CHECK-RV64V-NEXT:  # %bb.3:
 ; CHECK-RV64V-NEXT:    mv a3, a1
 ; CHECK-RV64V-NEXT:  .LBB914_4:
 ; CHECK-RV64V-NEXT:    add a6, s2, a0
-; CHECK-RV64V-NEXT:    addi a0, a7, -1
+; CHECK-RV64V-NEXT:    sub a0, a5, s1
+; CHECK-RV64V-NEXT:    addi a5, a7, -1
 ; CHECK-RV64V-NEXT:    sub a7, a3, s1
-; CHECK-RV64V-NEXT:    sltu t0, a3, a7
+; CHECK-RV64V-NEXT:    sltu t0, s1, a3
 ; CHECK-RV64V-NEXT:    addi t0, t0, -1
 ; CHECK-RV64V-NEXT:    and a7, t0, a7
 ; CHECK-RV64V-NEXT:    bltu a3, s1, .LBB914_6
@@ -36771,26 +36771,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (a6)
 ; CHECK-RV64V-NEXT:    addi a6, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 3
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 3
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a7, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 4
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 4
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    and a0, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a5, a0
 ; CHECK-RV64V-NEXT:    bltu s0, a2, .LBB914_8
 ; CHECK-RV64V-NEXT:  # %bb.7:
 ; CHECK-RV64V-NEXT:    mv s0, a2
@@ -36800,11 +36800,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64V-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64V-NEXT:    sub a0, s0, a1
-; CHECK-RV64V-NEXT:    sltu a2, s0, a0
+; CHECK-RV64V-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a0, a2, a0
 ; CHECK-RV64V-NEXT:    sub a2, a0, s1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    bltu a0, s1, .LBB914_10
@@ -36830,7 +36830,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64V-NEXT:    mv s0, a1
 ; CHECK-RV64V-NEXT:  .LBB914_12:
 ; CHECK-RV64V-NEXT:    sub a0, s0, s1
-; CHECK-RV64V-NEXT:    sltu a1, s0, a0
+; CHECK-RV64V-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64V-NEXT:    addi a1, a1, -1
 ; CHECK-RV64V-NEXT:    and a0, a1, a0
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -36885,45 +36885,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV32V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT:    csrr a4, vlenb
+; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT:    slli a3, a4, 3
-; CHECK-RV32V-NEXT:    slli a1, a4, 2
-; CHECK-RV32V-NEXT:    add a0, a0, a3
-; CHECK-RV32V-NEXT:    sub a3, a2, a1
+; CHECK-RV32V-NEXT:    slli a4, a1, 3
+; CHECK-RV32V-NEXT:    slli a3, a1, 2
+; CHECK-RV32V-NEXT:    slli a1, a1, 1
+; CHECK-RV32V-NEXT:    add a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a2, a3
+; CHECK-RV32V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32V-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT:    sltu a0, a2, a3
-; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a3, a0, a3
-; CHECK-RV32V-NEXT:    slli a0, a4, 1
-; CHECK-RV32V-NEXT:    sub a4, a3, a0
-; CHECK-RV32V-NEXT:    sltu a5, a3, a4
+; CHECK-RV32V-NEXT:    addi a0, a5, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a0, a1
+; CHECK-RV32V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32V-NEXT:    addi a5, a5, -1
 ; CHECK-RV32V-NEXT:    and a4, a5, a4
-; CHECK-RV32V-NEXT:    bltu a3, a0, .LBB914_2
+; CHECK-RV32V-NEXT:    bltu a0, a1, .LBB914_2
 ; CHECK-RV32V-NEXT:  # %bb.1:
-; CHECK-RV32V-NEXT:    mv a3, a0
+; CHECK-RV32V-NEXT:    mv a0, a1
 ; CHECK-RV32V-NEXT:  .LBB914_2:
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB914_4
+; CHECK-RV32V-NEXT:    bltu a2, a3, .LBB914_4
 ; CHECK-RV32V-NEXT:  # %bb.3:
-; CHECK-RV32V-NEXT:    mv a2, a1
+; CHECK-RV32V-NEXT:    mv a2, a3
 ; CHECK-RV32V-NEXT:  .LBB914_4:
-; CHECK-RV32V-NEXT:    sub a1, a2, a0
-; CHECK-RV32V-NEXT:    sltu a3, a2, a1
+; CHECK-RV32V-NEXT:    sub a0, a2, a1
+; CHECK-RV32V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
-; CHECK-RV32V-NEXT:    and a1, a3, a1
-; CHECK-RV32V-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    and a0, a3, a0
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT:    bltu a2, a0, .LBB914_6
+; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB914_6
 ; CHECK-RV32V-NEXT:  # %bb.5:
-; CHECK-RV32V-NEXT:    mv a2, a0
+; CHECK-RV32V-NEXT:    mv a2, a1
 ; CHECK-RV32V-NEXT:  .LBB914_6:
 ; CHECK-RV32V-NEXT:    addi a0, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -37011,33 +37011,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64VC-NEXT:    li a1, 40
 ; CHECK-RV64VC-NEXT:    mv a0, s1
 ; CHECK-RV64VC-NEXT:    call __muldi3
-; CHECK-RV64VC-NEXT:    slli a7, s1, 2
-; CHECK-RV64VC-NEXT:    sub a1, s0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a1
-; CHECK-RV64VC-NEXT:    addi a2, a2, -1
-; CHECK-RV64VC-NEXT:    and a3, a2, a1
+; CHECK-RV64VC-NEXT:    slli a2, s1, 2
 ; CHECK-RV64VC-NEXT:    slli a1, s1, 1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a3, s0, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a2, s0
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
-; CHECK-RV64VC-NEXT:    sub t0, a2, s1
-; CHECK-RV64VC-NEXT:    mv a5, a2
-; CHECK-RV64VC-NEXT:    bltu a2, s1, .LBB914_2
+; CHECK-RV64VC-NEXT:    and a3, a3, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a3
+; CHECK-RV64VC-NEXT:    addi a5, a5, -1
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
+; CHECK-RV64VC-NEXT:    mv t0, a5
+; CHECK-RV64VC-NEXT:    bltu a5, s1, .LBB914_2
 ; CHECK-RV64VC-NEXT:  # %bb.1:
-; CHECK-RV64VC-NEXT:    mv a5, s1
+; CHECK-RV64VC-NEXT:    mv t0, s1
 ; CHECK-RV64VC-NEXT:  .LBB914_2:
-; CHECK-RV64VC-NEXT:    sltu a6, a2, t0
+; CHECK-RV64VC-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB914_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
 ; CHECK-RV64VC-NEXT:    mv a3, a1
 ; CHECK-RV64VC-NEXT:  .LBB914_4:
 ; CHECK-RV64VC-NEXT:    add a0, a0, s2
-; CHECK-RV64VC-NEXT:    addi a6, a6, -1
-; CHECK-RV64VC-NEXT:    sub a2, a3, s1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a6, a5, s1
+; CHECK-RV64VC-NEXT:    addi a7, a7, -1
+; CHECK-RV64VC-NEXT:    sub a5, a3, s1
+; CHECK-RV64VC-NEXT:    sltu a4, s1, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
 ; CHECK-RV64VC-NEXT:    bltu a3, s1, .LBB914_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a3, s1
@@ -37045,7 +37045,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-RV64VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, t0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v14, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -37053,7 +37053,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64VC-NEXT:    add a0, a0, sp
 ; CHECK-RV64VC-NEXT:    addi a0, a0, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v13, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -37064,21 +37064,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    and a0, a6, t0
-; CHECK-RV64VC-NEXT:    bltu s0, a7, .LBB914_8
+; CHECK-RV64VC-NEXT:    and a0, a7, a6
+; CHECK-RV64VC-NEXT:    bltu s0, a2, .LBB914_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv s0, a7
+; CHECK-RV64VC-NEXT:    mv s0, a2
 ; CHECK-RV64VC-NEXT:  .LBB914_8:
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64VC-NEXT:    sub a0, s0, a1
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    sub a2, a0, s1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    bltu a0, s1, .LBB914_10
@@ -37104,7 +37104,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64VC-NEXT:    mv s0, a1
 ; CHECK-RV64VC-NEXT:  .LBB914_12:
 ; CHECK-RV64VC-NEXT:    sub a0, s0, s1
-; CHECK-RV64VC-NEXT:    sltu a1, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64VC-NEXT:    addi a1, a1, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -37159,45 +37159,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV32VC-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT:    csrr a4, vlenb
+; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT:    slli a3, a4, 3
-; CHECK-RV32VC-NEXT:    slli a1, a4, 2
-; CHECK-RV32VC-NEXT:    add a0, a0, a3
-; CHECK-RV32VC-NEXT:    sub a3, a2, a1
+; CHECK-RV32VC-NEXT:    slli a4, a1, 3
+; CHECK-RV32VC-NEXT:    slli a3, a1, 2
+; CHECK-RV32VC-NEXT:    slli a1, a1, 1
+; CHECK-RV32VC-NEXT:    add a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a2, a3
+; CHECK-RV32VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32VC-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT:    sltu a0, a2, a3
-; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a3, a3, a0
-; CHECK-RV32VC-NEXT:    slli a0, a4, 1
-; CHECK-RV32VC-NEXT:    sub a4, a3, a0
-; CHECK-RV32VC-NEXT:    sltu a5, a3, a4
+; CHECK-RV32VC-NEXT:    addi a0, a5, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a5
-; CHECK-RV32VC-NEXT:    bltu a3, a0, .LBB914_2
+; CHECK-RV32VC-NEXT:    bltu a0, a1, .LBB914_2
 ; CHECK-RV32VC-NEXT:  # %bb.1:
-; CHECK-RV32VC-NEXT:    mv a3, a0
+; CHECK-RV32VC-NEXT:    mv a0, a1
 ; CHECK-RV32VC-NEXT:  .LBB914_2:
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB914_4
+; CHECK-RV32VC-NEXT:    bltu a2, a3, .LBB914_4
 ; CHECK-RV32VC-NEXT:  # %bb.3:
-; CHECK-RV32VC-NEXT:    mv a2, a1
+; CHECK-RV32VC-NEXT:    mv a2, a3
 ; CHECK-RV32VC-NEXT:  .LBB914_4:
-; CHECK-RV32VC-NEXT:    sub a1, a2, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a2, a1
+; CHECK-RV32VC-NEXT:    sub a0, a2, a1
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
-; CHECK-RV32VC-NEXT:    and a1, a1, a3
-; CHECK-RV32VC-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    and a0, a0, a3
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT:    bltu a2, a0, .LBB914_6
+; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB914_6
 ; CHECK-RV32VC-NEXT:  # %bb.5:
-; CHECK-RV32VC-NEXT:    mv a2, a0
+; CHECK-RV32VC-NEXT:    mv a2, a1
 ; CHECK-RV32VC-NEXT:  .LBB914_6:
 ; CHECK-RV32VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -37342,9 +37342,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, s0
+; CHECK-RV64V-NEXT:    sltu a4, s0, a4
 ; CHECK-RV64V-NEXT:    sub a5, a3, a1
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a5
+; CHECK-RV64V-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
@@ -37360,17 +37360,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT:    sub a3, a0, s0
-; CHECK-RV64V-NEXT:    sub a2, s1, a2
-; CHECK-RV64V-NEXT:    sltu a0, a0, a3
-; CHECK-RV64V-NEXT:    sltu a4, s1, a2
+; CHECK-RV64V-NEXT:    sub a3, s1, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, s1
+; CHECK-RV64V-NEXT:    sub a4, a0, s0
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    and a3, a0, a3
-; CHECK-RV64V-NEXT:    and a0, a4, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
+; CHECK-RV64V-NEXT:    and a4, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a2, a3
 ; CHECK-RV64V-NEXT:    addi a2, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a2, a0
@@ -37391,23 +37391,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT:    sub a3, a2, s0
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a2, a2, a3
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a3, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a1, a2, s0
+; CHECK-RV64V-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a2, a2, a3
-; CHECK-RV64V-NEXT:    and a0, a0, a1
-; CHECK-RV64V-NEXT:    csrr a1, vlenb
-; CHECK-RV64V-NEXT:    slli a1, a1, 3
-; CHECK-RV64V-NEXT:    mv a3, a1
-; CHECK-RV64V-NEXT:    slli a1, a1, 1
-; CHECK-RV64V-NEXT:    add a1, a1, a3
-; CHECK-RV64V-NEXT:    add a1, sp, a1
-; CHECK-RV64V-NEXT:    addi a1, a1, 16
-; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    and a1, a2, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a3
+; CHECK-RV64V-NEXT:    csrr a2, vlenb
+; CHECK-RV64V-NEXT:    slli a2, a2, 3
+; CHECK-RV64V-NEXT:    mv a3, a2
+; CHECK-RV64V-NEXT:    slli a2, a2, 1
+; CHECK-RV64V-NEXT:    add a2, a2, a3
+; CHECK-RV64V-NEXT:    add a2, sp, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, 16
+; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v29, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a1, a0
@@ -37424,7 +37424,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a1, a0, s0
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a1
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -37493,13 +37493,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32V-NEXT:    sub a0, a4, a1
-; CHECK-RV32V-NEXT:    sub a2, a3, a2
-; CHECK-RV32V-NEXT:    sltu a4, a4, a0
-; CHECK-RV32V-NEXT:    sltu a3, a3, a2
+; CHECK-RV32V-NEXT:    sltu a4, a1, a4
+; CHECK-RV32V-NEXT:    sub a5, a3, a2
+; CHECK-RV32V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32V-NEXT:    addi a4, a4, -1
-; CHECK-RV32V-NEXT:    addi a3, a3, -1
+; CHECK-RV32V-NEXT:    addi a2, a2, -1
 ; CHECK-RV32V-NEXT:    and a4, a4, a0
-; CHECK-RV32V-NEXT:    and a0, a3, a2
+; CHECK-RV32V-NEXT:    and a0, a2, a5
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v0
@@ -37511,10 +37511,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT:    sub a1, a0, a1
-; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    sub a2, a0, a1
+; CHECK-RV32V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a0, a0, a1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -37609,102 +37609,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    mv a3, a6
 ; CHECK-RV64VC-NEXT:  .LBB915_2:
 ; CHECK-RV64VC-NEXT:    slli a5, s0, 4
-; CHECK-RV64VC-NEXT:    slli a7, s0, 1
-; CHECK-RV64VC-NEXT:    slli a2, s0, 3
+; CHECK-RV64VC-NEXT:    slli a1, s0, 1
+; CHECK-RV64VC-NEXT:    slli a7, s0, 3
 ; CHECK-RV64VC-NEXT:    mv a4, a3
-; CHECK-RV64VC-NEXT:    bltu a3, a7, .LBB915_4
+; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB915_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
-; CHECK-RV64VC-NEXT:    mv a4, a7
+; CHECK-RV64VC-NEXT:    mv a4, a1
 ; CHECK-RV64VC-NEXT:  .LBB915_4:
 ; CHECK-RV64VC-NEXT:    vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT:    add a1, s1, a0
+; CHECK-RV64VC-NEXT:    add a2, s1, a0
 ; CHECK-RV64VC-NEXT:    add a5, a5, s1
-; CHECK-RV64VC-NEXT:    add a2, a2, s1
+; CHECK-RV64VC-NEXT:    add a7, a7, s1
 ; CHECK-RV64VC-NEXT:    mv a0, a4
 ; CHECK-RV64VC-NEXT:    bltu a4, s0, .LBB915_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a0, s0
 ; CHECK-RV64VC-NEXT:  .LBB915_6:
-; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 3
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 3
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT:    addi a1, sp, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    mv a2, a1
-; CHECK-RV64VC-NEXT:    slli a1, a1, 1
-; CHECK-RV64VC-NEXT:    add a1, a1, a2
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    addi a2, sp, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    mv a5, a2
+; CHECK-RV64VC-NEXT:    slli a2, a2, 1
+; CHECK-RV64VC-NEXT:    add a2, a2, a5
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, s0
-; CHECK-RV64VC-NEXT:    sub a1, a3, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a1
+; CHECK-RV64VC-NEXT:    and a0, a3, a4
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT:    mv a1, a0
+; CHECK-RV64VC-NEXT:    mv a2, a0
 ; CHECK-RV64VC-NEXT:    bltu a0, s0, .LBB915_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a2, s0
 ; CHECK-RV64VC-NEXT:  .LBB915_8:
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT:    sub a1, a0, s0
 ; CHECK-RV64VC-NEXT:    sub a2, s2, a6
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, s2, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a6, s2
+; CHECK-RV64VC-NEXT:    sub a4, a0, s0
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
-; CHECK-RV64VC-NEXT:    and a1, a1, a0
+; CHECK-RV64VC-NEXT:    and a4, a4, a0
 ; CHECK-RV64VC-NEXT:    and a0, a3, a2
 ; CHECK-RV64VC-NEXT:    addi a2, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64VC-NEXT:    mv a2, a0
-; CHECK-RV64VC-NEXT:    bltu a0, a7, .LBB915_10
+; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB915_10
 ; CHECK-RV64VC-NEXT:  # %bb.9:
-; CHECK-RV64VC-NEXT:    mv a2, a7
+; CHECK-RV64VC-NEXT:    mv a2, a1
 ; CHECK-RV64VC-NEXT:  .LBB915_10:
-; CHECK-RV64VC-NEXT:    mv a1, a2
+; CHECK-RV64VC-NEXT:    mv a3, a2
 ; CHECK-RV64VC-NEXT:    bltu a2, s0, .LBB915_12
 ; CHECK-RV64VC-NEXT:  # %bb.11:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a3, s0
 ; CHECK-RV64VC-NEXT:  .LBB915_12:
-; CHECK-RV64VC-NEXT:    csrr a3, vlenb
-; CHECK-RV64VC-NEXT:    slli a3, a3, 3
-; CHECK-RV64VC-NEXT:    add a3, a3, sp
-; CHECK-RV64VC-NEXT:    addi a3, a3, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    csrr a4, vlenb
+; CHECK-RV64VC-NEXT:    slli a4, a4, 3
+; CHECK-RV64VC-NEXT:    add a4, a4, sp
+; CHECK-RV64VC-NEXT:    addi a4, a4, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT:    sub a3, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    sub a1, a2, s0
-; CHECK-RV64VC-NEXT:    sub a3, a0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a3
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a1, a1, a2
@@ -37734,7 +37734,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a1, a0, s0
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -37803,13 +37803,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32VC-NEXT:    sub a0, a4, a1
-; CHECK-RV32VC-NEXT:    sub a2, a3, a2
-; CHECK-RV32VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV32VC-NEXT:    sub a5, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32VC-NEXT:    addi a4, a4, -1
-; CHECK-RV32VC-NEXT:    addi a3, a3, -1
+; CHECK-RV32VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a0
-; CHECK-RV32VC-NEXT:    and a0, a3, a2
+; CHECK-RV32VC-NEXT:    and a0, a2, a5
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v0
@@ -37821,10 +37821,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT:    sub a1, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -37967,9 +37967,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, s0
+; CHECK-RV64V-NEXT:    sltu a4, s0, a4
 ; CHECK-RV64V-NEXT:    sub a5, a3, a1
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a5
+; CHECK-RV64V-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
@@ -37985,17 +37985,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT:    sub a3, a0, s0
-; CHECK-RV64V-NEXT:    sub a2, s1, a2
-; CHECK-RV64V-NEXT:    sltu a0, a0, a3
-; CHECK-RV64V-NEXT:    sltu a4, s1, a2
+; CHECK-RV64V-NEXT:    sub a3, s1, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, s1
+; CHECK-RV64V-NEXT:    sub a4, a0, s0
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    and a3, a0, a3
-; CHECK-RV64V-NEXT:    and a0, a4, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
+; CHECK-RV64V-NEXT:    and a4, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a2, a3
 ; CHECK-RV64V-NEXT:    addi a2, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a2, a0
@@ -38016,23 +38016,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT:    sub a3, a2, s0
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a2, a2, a3
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a3, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a1, a2, s0
+; CHECK-RV64V-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a2, a2, a3
-; CHECK-RV64V-NEXT:    and a0, a0, a1
-; CHECK-RV64V-NEXT:    csrr a1, vlenb
-; CHECK-RV64V-NEXT:    slli a1, a1, 3
-; CHECK-RV64V-NEXT:    mv a3, a1
-; CHECK-RV64V-NEXT:    slli a1, a1, 1
-; CHECK-RV64V-NEXT:    add a1, a1, a3
-; CHECK-RV64V-NEXT:    add a1, sp, a1
-; CHECK-RV64V-NEXT:    addi a1, a1, 16
-; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    and a1, a2, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a3
+; CHECK-RV64V-NEXT:    csrr a2, vlenb
+; CHECK-RV64V-NEXT:    slli a2, a2, 3
+; CHECK-RV64V-NEXT:    mv a3, a2
+; CHECK-RV64V-NEXT:    slli a2, a2, 1
+; CHECK-RV64V-NEXT:    add a2, a2, a3
+; CHECK-RV64V-NEXT:    add a2, sp, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, 16
+; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v29, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a1, a0
@@ -38049,7 +38049,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a1, a0, s0
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a1
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -38118,13 +38118,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32V-NEXT:    sub a0, a4, a1
-; CHECK-RV32V-NEXT:    sub a2, a3, a2
-; CHECK-RV32V-NEXT:    sltu a4, a4, a0
-; CHECK-RV32V-NEXT:    sltu a3, a3, a2
+; CHECK-RV32V-NEXT:    sltu a4, a1, a4
+; CHECK-RV32V-NEXT:    sub a5, a3, a2
+; CHECK-RV32V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32V-NEXT:    addi a4, a4, -1
-; CHECK-RV32V-NEXT:    addi a3, a3, -1
+; CHECK-RV32V-NEXT:    addi a2, a2, -1
 ; CHECK-RV32V-NEXT:    and a4, a4, a0
-; CHECK-RV32V-NEXT:    and a0, a3, a2
+; CHECK-RV32V-NEXT:    and a0, a2, a5
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v0
@@ -38136,10 +38136,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT:    sub a1, a0, a1
-; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    sub a2, a0, a1
+; CHECK-RV32V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a0, a0, a1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -38234,102 +38234,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV64VC-NEXT:    mv a3, a6
 ; CHECK-RV64VC-NEXT:  .LBB916_2:
 ; CHECK-RV64VC-NEXT:    slli a5, s0, 4
-; CHECK-RV64VC-NEXT:    slli a7, s0, 1
-; CHECK-RV64VC-NEXT:    slli a2, s0, 3
+; CHECK-RV64VC-NEXT:    slli a1, s0, 1
+; CHECK-RV64VC-NEXT:    slli a7, s0, 3
 ; CHECK-RV64VC-NEXT:    mv a4, a3
-; CHECK-RV64VC-NEXT:    bltu a3, a7, .LBB916_4
+; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB916_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
-; CHECK-RV64VC-NEXT:    mv a4, a7
+; CHECK-RV64VC-NEXT:    mv a4, a1
 ; CHECK-RV64VC-NEXT:  .LBB916_4:
 ; CHECK-RV64VC-NEXT:    vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT:    add a1, s1, a0
+; CHECK-RV64VC-NEXT:    add a2, s1, a0
 ; CHECK-RV64VC-NEXT:    add a5, a5, s1
-; CHECK-RV64VC-NEXT:    add a2, a2, s1
+; CHECK-RV64VC-NEXT:    add a7, a7, s1
 ; CHECK-RV64VC-NEXT:    mv a0, a4
 ; CHECK-RV64VC-NEXT:    bltu a4, s0, .LBB916_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a0, s0
 ; CHECK-RV64VC-NEXT:  .LBB916_6:
-; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 3
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 3
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT:    addi a1, sp, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    mv a2, a1
-; CHECK-RV64VC-NEXT:    slli a1, a1, 1
-; CHECK-RV64VC-NEXT:    add a1, a1, a2
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    addi a2, sp, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    mv a5, a2
+; CHECK-RV64VC-NEXT:    slli a2, a2, 1
+; CHECK-RV64VC-NEXT:    add a2, a2, a5
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, s0
-; CHECK-RV64VC-NEXT:    sub a1, a3, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a1
+; CHECK-RV64VC-NEXT:    and a0, a3, a4
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT:    mv a1, a0
+; CHECK-RV64VC-NEXT:    mv a2, a0
 ; CHECK-RV64VC-NEXT:    bltu a0, s0, .LBB916_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a2, s0
 ; CHECK-RV64VC-NEXT:  .LBB916_8:
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT:    sub a1, a0, s0
 ; CHECK-RV64VC-NEXT:    sub a2, s2, a6
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, s2, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a6, s2
+; CHECK-RV64VC-NEXT:    sub a4, a0, s0
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
-; CHECK-RV64VC-NEXT:    and a1, a1, a0
+; CHECK-RV64VC-NEXT:    and a4, a4, a0
 ; CHECK-RV64VC-NEXT:    and a0, a3, a2
 ; CHECK-RV64VC-NEXT:    addi a2, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64VC-NEXT:    mv a2, a0
-; CHECK-RV64VC-NEXT:    bltu a0, a7, .LBB916_10
+; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB916_10
 ; CHECK-RV64VC-NEXT:  # %bb.9:
-; CHECK-RV64VC-NEXT:    mv a2, a7
+; CHECK-RV64VC-NEXT:    mv a2, a1
 ; CHECK-RV64VC-NEXT:  .LBB916_10:
-; CHECK-RV64VC-NEXT:    mv a1, a2
+; CHECK-RV64VC-NEXT:    mv a3, a2
 ; CHECK-RV64VC-NEXT:    bltu a2, s0, .LBB916_12
 ; CHECK-RV64VC-NEXT:  # %bb.11:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a3, s0
 ; CHECK-RV64VC-NEXT:  .LBB916_12:
-; CHECK-RV64VC-NEXT:    csrr a3, vlenb
-; CHECK-RV64VC-NEXT:    slli a3, a3, 3
-; CHECK-RV64VC-NEXT:    add a3, a3, sp
-; CHECK-RV64VC-NEXT:    addi a3, a3, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    csrr a4, vlenb
+; CHECK-RV64VC-NEXT:    slli a4, a4, 3
+; CHECK-RV64VC-NEXT:    add a4, a4, sp
+; CHECK-RV64VC-NEXT:    addi a4, a4, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT:    sub a3, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    sub a1, a2, s0
-; CHECK-RV64VC-NEXT:    sub a3, a0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a3
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a1, a1, a2
@@ -38359,7 +38359,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a1, a0, s0
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -38428,13 +38428,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32VC-NEXT:    sub a0, a4, a1
-; CHECK-RV32VC-NEXT:    sub a2, a3, a2
-; CHECK-RV32VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV32VC-NEXT:    sub a5, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32VC-NEXT:    addi a4, a4, -1
-; CHECK-RV32VC-NEXT:    addi a3, a3, -1
+; CHECK-RV32VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a0
-; CHECK-RV32VC-NEXT:    and a0, a3, a2
+; CHECK-RV32VC-NEXT:    and a0, a2, a5
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v0
@@ -38446,10 +38446,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT:    sub a1, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -38592,9 +38592,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, s0
+; CHECK-RV64V-NEXT:    sltu a4, s0, a4
 ; CHECK-RV64V-NEXT:    sub a5, a3, a1
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a5
+; CHECK-RV64V-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
@@ -38610,17 +38610,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT:    sub a3, a0, s0
-; CHECK-RV64V-NEXT:    sub a2, s1, a2
-; CHECK-RV64V-NEXT:    sltu a0, a0, a3
-; CHECK-RV64V-NEXT:    sltu a4, s1, a2
+; CHECK-RV64V-NEXT:    sub a3, s1, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, s1
+; CHECK-RV64V-NEXT:    sub a4, a0, s0
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    and a3, a0, a3
-; CHECK-RV64V-NEXT:    and a0, a4, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
+; CHECK-RV64V-NEXT:    and a4, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a2, a3
 ; CHECK-RV64V-NEXT:    addi a2, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a2, a0
@@ -38641,23 +38641,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT:    sub a3, a2, s0
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a2, a2, a3
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a3, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a1, a2, s0
+; CHECK-RV64V-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a2, a2, a3
-; CHECK-RV64V-NEXT:    and a0, a0, a1
-; CHECK-RV64V-NEXT:    csrr a1, vlenb
-; CHECK-RV64V-NEXT:    slli a1, a1, 3
-; CHECK-RV64V-NEXT:    mv a3, a1
-; CHECK-RV64V-NEXT:    slli a1, a1, 1
-; CHECK-RV64V-NEXT:    add a1, a1, a3
-; CHECK-RV64V-NEXT:    add a1, sp, a1
-; CHECK-RV64V-NEXT:    addi a1, a1, 16
-; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    and a1, a2, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a3
+; CHECK-RV64V-NEXT:    csrr a2, vlenb
+; CHECK-RV64V-NEXT:    slli a2, a2, 3
+; CHECK-RV64V-NEXT:    mv a3, a2
+; CHECK-RV64V-NEXT:    slli a2, a2, 1
+; CHECK-RV64V-NEXT:    add a2, a2, a3
+; CHECK-RV64V-NEXT:    add a2, sp, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, 16
+; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v29, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a1, a0
@@ -38674,7 +38674,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a1, a0, s0
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a1
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -38743,13 +38743,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32V-NEXT:    sub a0, a4, a1
-; CHECK-RV32V-NEXT:    sub a2, a3, a2
-; CHECK-RV32V-NEXT:    sltu a4, a4, a0
-; CHECK-RV32V-NEXT:    sltu a3, a3, a2
+; CHECK-RV32V-NEXT:    sltu a4, a1, a4
+; CHECK-RV32V-NEXT:    sub a5, a3, a2
+; CHECK-RV32V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32V-NEXT:    addi a4, a4, -1
-; CHECK-RV32V-NEXT:    addi a3, a3, -1
+; CHECK-RV32V-NEXT:    addi a2, a2, -1
 ; CHECK-RV32V-NEXT:    and a4, a4, a0
-; CHECK-RV32V-NEXT:    and a0, a3, a2
+; CHECK-RV32V-NEXT:    and a0, a2, a5
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v0
@@ -38761,10 +38761,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT:    sub a1, a0, a1
-; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    sub a2, a0, a1
+; CHECK-RV32V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a0, a0, a1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -38859,102 +38859,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    mv a3, a6
 ; CHECK-RV64VC-NEXT:  .LBB917_2:
 ; CHECK-RV64VC-NEXT:    slli a5, s0, 4
-; CHECK-RV64VC-NEXT:    slli a7, s0, 1
-; CHECK-RV64VC-NEXT:    slli a2, s0, 3
+; CHECK-RV64VC-NEXT:    slli a1, s0, 1
+; CHECK-RV64VC-NEXT:    slli a7, s0, 3
 ; CHECK-RV64VC-NEXT:    mv a4, a3
-; CHECK-RV64VC-NEXT:    bltu a3, a7, .LBB917_4
+; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB917_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
-; CHECK-RV64VC-NEXT:    mv a4, a7
+; CHECK-RV64VC-NEXT:    mv a4, a1
 ; CHECK-RV64VC-NEXT:  .LBB917_4:
 ; CHECK-RV64VC-NEXT:    vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT:    add a1, s1, a0
+; CHECK-RV64VC-NEXT:    add a2, s1, a0
 ; CHECK-RV64VC-NEXT:    add a5, a5, s1
-; CHECK-RV64VC-NEXT:    add a2, a2, s1
+; CHECK-RV64VC-NEXT:    add a7, a7, s1
 ; CHECK-RV64VC-NEXT:    mv a0, a4
 ; CHECK-RV64VC-NEXT:    bltu a4, s0, .LBB917_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a0, s0
 ; CHECK-RV64VC-NEXT:  .LBB917_6:
-; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 3
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 3
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT:    addi a1, sp, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    mv a2, a1
-; CHECK-RV64VC-NEXT:    slli a1, a1, 1
-; CHECK-RV64VC-NEXT:    add a1, a1, a2
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    addi a2, sp, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    mv a5, a2
+; CHECK-RV64VC-NEXT:    slli a2, a2, 1
+; CHECK-RV64VC-NEXT:    add a2, a2, a5
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, s0
-; CHECK-RV64VC-NEXT:    sub a1, a3, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a1
+; CHECK-RV64VC-NEXT:    and a0, a3, a4
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT:    mv a1, a0
+; CHECK-RV64VC-NEXT:    mv a2, a0
 ; CHECK-RV64VC-NEXT:    bltu a0, s0, .LBB917_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a2, s0
 ; CHECK-RV64VC-NEXT:  .LBB917_8:
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT:    sub a1, a0, s0
 ; CHECK-RV64VC-NEXT:    sub a2, s2, a6
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, s2, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a6, s2
+; CHECK-RV64VC-NEXT:    sub a4, a0, s0
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
-; CHECK-RV64VC-NEXT:    and a1, a1, a0
+; CHECK-RV64VC-NEXT:    and a4, a4, a0
 ; CHECK-RV64VC-NEXT:    and a0, a3, a2
 ; CHECK-RV64VC-NEXT:    addi a2, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64VC-NEXT:    mv a2, a0
-; CHECK-RV64VC-NEXT:    bltu a0, a7, .LBB917_10
+; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB917_10
 ; CHECK-RV64VC-NEXT:  # %bb.9:
-; CHECK-RV64VC-NEXT:    mv a2, a7
+; CHECK-RV64VC-NEXT:    mv a2, a1
 ; CHECK-RV64VC-NEXT:  .LBB917_10:
-; CHECK-RV64VC-NEXT:    mv a1, a2
+; CHECK-RV64VC-NEXT:    mv a3, a2
 ; CHECK-RV64VC-NEXT:    bltu a2, s0, .LBB917_12
 ; CHECK-RV64VC-NEXT:  # %bb.11:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a3, s0
 ; CHECK-RV64VC-NEXT:  .LBB917_12:
-; CHECK-RV64VC-NEXT:    csrr a3, vlenb
-; CHECK-RV64VC-NEXT:    slli a3, a3, 3
-; CHECK-RV64VC-NEXT:    add a3, a3, sp
-; CHECK-RV64VC-NEXT:    addi a3, a3, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    csrr a4, vlenb
+; CHECK-RV64VC-NEXT:    slli a4, a4, 3
+; CHECK-RV64VC-NEXT:    add a4, a4, sp
+; CHECK-RV64VC-NEXT:    addi a4, a4, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT:    sub a3, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    sub a1, a2, s0
-; CHECK-RV64VC-NEXT:    sub a3, a0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a3
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a1, a1, a2
@@ -38984,7 +38984,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a1, a0, s0
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -39053,13 +39053,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32VC-NEXT:    sub a0, a4, a1
-; CHECK-RV32VC-NEXT:    sub a2, a3, a2
-; CHECK-RV32VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV32VC-NEXT:    sub a5, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32VC-NEXT:    addi a4, a4, -1
-; CHECK-RV32VC-NEXT:    addi a3, a3, -1
+; CHECK-RV32VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a0
-; CHECK-RV32VC-NEXT:    and a0, a3, a2
+; CHECK-RV32VC-NEXT:    and a0, a2, a5
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v0
@@ -39071,10 +39071,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT:    sub a1, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -39217,9 +39217,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, s0
+; CHECK-RV64V-NEXT:    sltu a4, s0, a4
 ; CHECK-RV64V-NEXT:    sub a5, a3, a1
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a5
+; CHECK-RV64V-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
@@ -39235,17 +39235,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT:    sub a3, a0, s0
-; CHECK-RV64V-NEXT:    sub a2, s1, a2
-; CHECK-RV64V-NEXT:    sltu a0, a0, a3
-; CHECK-RV64V-NEXT:    sltu a4, s1, a2
+; CHECK-RV64V-NEXT:    sub a3, s1, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, s1
+; CHECK-RV64V-NEXT:    sub a4, a0, s0
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    and a3, a0, a3
-; CHECK-RV64V-NEXT:    and a0, a4, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
+; CHECK-RV64V-NEXT:    and a4, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a2, a3
 ; CHECK-RV64V-NEXT:    addi a2, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a2, a0
@@ -39266,23 +39266,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT:    sub a3, a2, s0
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a2, a2, a3
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a3, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a1, a2, s0
+; CHECK-RV64V-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a2, a2, a3
-; CHECK-RV64V-NEXT:    and a0, a0, a1
-; CHECK-RV64V-NEXT:    csrr a1, vlenb
-; CHECK-RV64V-NEXT:    slli a1, a1, 3
-; CHECK-RV64V-NEXT:    mv a3, a1
-; CHECK-RV64V-NEXT:    slli a1, a1, 1
-; CHECK-RV64V-NEXT:    add a1, a1, a3
-; CHECK-RV64V-NEXT:    add a1, sp, a1
-; CHECK-RV64V-NEXT:    addi a1, a1, 16
-; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    and a1, a2, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a3
+; CHECK-RV64V-NEXT:    csrr a2, vlenb
+; CHECK-RV64V-NEXT:    slli a2, a2, 3
+; CHECK-RV64V-NEXT:    mv a3, a2
+; CHECK-RV64V-NEXT:    slli a2, a2, 1
+; CHECK-RV64V-NEXT:    add a2, a2, a3
+; CHECK-RV64V-NEXT:    add a2, sp, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, 16
+; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v29, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a1, a0
@@ -39299,7 +39299,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a1, a0, s0
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a1
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -39368,13 +39368,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32V-NEXT:    sub a0, a4, a1
-; CHECK-RV32V-NEXT:    sub a2, a3, a2
-; CHECK-RV32V-NEXT:    sltu a4, a4, a0
-; CHECK-RV32V-NEXT:    sltu a3, a3, a2
+; CHECK-RV32V-NEXT:    sltu a4, a1, a4
+; CHECK-RV32V-NEXT:    sub a5, a3, a2
+; CHECK-RV32V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32V-NEXT:    addi a4, a4, -1
-; CHECK-RV32V-NEXT:    addi a3, a3, -1
+; CHECK-RV32V-NEXT:    addi a2, a2, -1
 ; CHECK-RV32V-NEXT:    and a4, a4, a0
-; CHECK-RV32V-NEXT:    and a0, a3, a2
+; CHECK-RV32V-NEXT:    and a0, a2, a5
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v0
@@ -39386,10 +39386,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT:    sub a1, a0, a1
-; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    sub a2, a0, a1
+; CHECK-RV32V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a0, a0, a1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -39484,102 +39484,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV64VC-NEXT:    mv a3, a6
 ; CHECK-RV64VC-NEXT:  .LBB918_2:
 ; CHECK-RV64VC-NEXT:    slli a5, s0, 4
-; CHECK-RV64VC-NEXT:    slli a7, s0, 1
-; CHECK-RV64VC-NEXT:    slli a2, s0, 3
+; CHECK-RV64VC-NEXT:    slli a1, s0, 1
+; CHECK-RV64VC-NEXT:    slli a7, s0, 3
 ; CHECK-RV64VC-NEXT:    mv a4, a3
-; CHECK-RV64VC-NEXT:    bltu a3, a7, .LBB918_4
+; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB918_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
-; CHECK-RV64VC-NEXT:    mv a4, a7
+; CHECK-RV64VC-NEXT:    mv a4, a1
 ; CHECK-RV64VC-NEXT:  .LBB918_4:
 ; CHECK-RV64VC-NEXT:    vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT:    add a1, s1, a0
+; CHECK-RV64VC-NEXT:    add a2, s1, a0
 ; CHECK-RV64VC-NEXT:    add a5, a5, s1
-; CHECK-RV64VC-NEXT:    add a2, a2, s1
+; CHECK-RV64VC-NEXT:    add a7, a7, s1
 ; CHECK-RV64VC-NEXT:    mv a0, a4
 ; CHECK-RV64VC-NEXT:    bltu a4, s0, .LBB918_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a0, s0
 ; CHECK-RV64VC-NEXT:  .LBB918_6:
-; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 3
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 3
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT:    addi a1, sp, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    mv a2, a1
-; CHECK-RV64VC-NEXT:    slli a1, a1, 1
-; CHECK-RV64VC-NEXT:    add a1, a1, a2
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    addi a2, sp, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    mv a5, a2
+; CHECK-RV64VC-NEXT:    slli a2, a2, 1
+; CHECK-RV64VC-NEXT:    add a2, a2, a5
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, s0
-; CHECK-RV64VC-NEXT:    sub a1, a3, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a1
+; CHECK-RV64VC-NEXT:    and a0, a3, a4
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT:    mv a1, a0
+; CHECK-RV64VC-NEXT:    mv a2, a0
 ; CHECK-RV64VC-NEXT:    bltu a0, s0, .LBB918_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a2, s0
 ; CHECK-RV64VC-NEXT:  .LBB918_8:
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT:    sub a1, a0, s0
 ; CHECK-RV64VC-NEXT:    sub a2, s2, a6
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, s2, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a6, s2
+; CHECK-RV64VC-NEXT:    sub a4, a0, s0
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
-; CHECK-RV64VC-NEXT:    and a1, a1, a0
+; CHECK-RV64VC-NEXT:    and a4, a4, a0
 ; CHECK-RV64VC-NEXT:    and a0, a3, a2
 ; CHECK-RV64VC-NEXT:    addi a2, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64VC-NEXT:    mv a2, a0
-; CHECK-RV64VC-NEXT:    bltu a0, a7, .LBB918_10
+; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB918_10
 ; CHECK-RV64VC-NEXT:  # %bb.9:
-; CHECK-RV64VC-NEXT:    mv a2, a7
+; CHECK-RV64VC-NEXT:    mv a2, a1
 ; CHECK-RV64VC-NEXT:  .LBB918_10:
-; CHECK-RV64VC-NEXT:    mv a1, a2
+; CHECK-RV64VC-NEXT:    mv a3, a2
 ; CHECK-RV64VC-NEXT:    bltu a2, s0, .LBB918_12
 ; CHECK-RV64VC-NEXT:  # %bb.11:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a3, s0
 ; CHECK-RV64VC-NEXT:  .LBB918_12:
-; CHECK-RV64VC-NEXT:    csrr a3, vlenb
-; CHECK-RV64VC-NEXT:    slli a3, a3, 3
-; CHECK-RV64VC-NEXT:    add a3, a3, sp
-; CHECK-RV64VC-NEXT:    addi a3, a3, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    csrr a4, vlenb
+; CHECK-RV64VC-NEXT:    slli a4, a4, 3
+; CHECK-RV64VC-NEXT:    add a4, a4, sp
+; CHECK-RV64VC-NEXT:    addi a4, a4, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT:    sub a3, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    sub a1, a2, s0
-; CHECK-RV64VC-NEXT:    sub a3, a0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a3
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a1, a1, a2
@@ -39609,7 +39609,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a1, a0, s0
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -39678,13 +39678,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32VC-NEXT:    sub a0, a4, a1
-; CHECK-RV32VC-NEXT:    sub a2, a3, a2
-; CHECK-RV32VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV32VC-NEXT:    sub a5, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32VC-NEXT:    addi a4, a4, -1
-; CHECK-RV32VC-NEXT:    addi a3, a3, -1
+; CHECK-RV32VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a0
-; CHECK-RV32VC-NEXT:    and a0, a3, a2
+; CHECK-RV32VC-NEXT:    and a0, a2, a5
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v0
@@ -39696,10 +39696,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT:    sub a1, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -39841,9 +39841,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, s0
+; CHECK-RV64V-NEXT:    sltu a4, s0, a4
 ; CHECK-RV64V-NEXT:    sub a5, a3, a1
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a5
+; CHECK-RV64V-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
@@ -39859,17 +39859,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT:    sub a3, a0, s0
-; CHECK-RV64V-NEXT:    sub a2, s1, a2
-; CHECK-RV64V-NEXT:    sltu a0, a0, a3
-; CHECK-RV64V-NEXT:    sltu a4, s1, a2
+; CHECK-RV64V-NEXT:    sub a3, s1, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, s1
+; CHECK-RV64V-NEXT:    sub a4, a0, s0
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    and a3, a0, a3
-; CHECK-RV64V-NEXT:    and a0, a4, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
+; CHECK-RV64V-NEXT:    and a4, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a2, a3
 ; CHECK-RV64V-NEXT:    addi a2, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a2, a0
@@ -39890,23 +39890,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT:    sub a3, a2, s0
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a2, a2, a3
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a3, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a1, a2, s0
+; CHECK-RV64V-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a2, a2, a3
-; CHECK-RV64V-NEXT:    and a0, a0, a1
-; CHECK-RV64V-NEXT:    csrr a1, vlenb
-; CHECK-RV64V-NEXT:    slli a1, a1, 3
-; CHECK-RV64V-NEXT:    mv a3, a1
-; CHECK-RV64V-NEXT:    slli a1, a1, 1
-; CHECK-RV64V-NEXT:    add a1, a1, a3
-; CHECK-RV64V-NEXT:    add a1, sp, a1
-; CHECK-RV64V-NEXT:    addi a1, a1, 16
-; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    and a1, a2, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a3
+; CHECK-RV64V-NEXT:    csrr a2, vlenb
+; CHECK-RV64V-NEXT:    slli a2, a2, 3
+; CHECK-RV64V-NEXT:    mv a3, a2
+; CHECK-RV64V-NEXT:    slli a2, a2, 1
+; CHECK-RV64V-NEXT:    add a2, a2, a3
+; CHECK-RV64V-NEXT:    add a2, sp, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, 16
+; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v29, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a1, a0
@@ -39923,7 +39923,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a1, a0, s0
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a1
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -39992,13 +39992,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32V-NEXT:    sub a0, a4, a1
-; CHECK-RV32V-NEXT:    sub a2, a3, a2
-; CHECK-RV32V-NEXT:    sltu a4, a4, a0
-; CHECK-RV32V-NEXT:    sltu a3, a3, a2
+; CHECK-RV32V-NEXT:    sltu a4, a1, a4
+; CHECK-RV32V-NEXT:    sub a5, a3, a2
+; CHECK-RV32V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32V-NEXT:    addi a4, a4, -1
-; CHECK-RV32V-NEXT:    addi a3, a3, -1
+; CHECK-RV32V-NEXT:    addi a2, a2, -1
 ; CHECK-RV32V-NEXT:    and a4, a4, a0
-; CHECK-RV32V-NEXT:    and a0, a3, a2
+; CHECK-RV32V-NEXT:    and a0, a2, a5
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v0
@@ -40010,10 +40010,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT:    sub a1, a0, a1
-; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    sub a2, a0, a1
+; CHECK-RV32V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a0, a0, a1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -40108,102 +40108,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV64VC-NEXT:    mv a3, a6
 ; CHECK-RV64VC-NEXT:  .LBB919_2:
 ; CHECK-RV64VC-NEXT:    slli a5, s0, 4
-; CHECK-RV64VC-NEXT:    slli a7, s0, 1
-; CHECK-RV64VC-NEXT:    slli a2, s0, 3
+; CHECK-RV64VC-NEXT:    slli a1, s0, 1
+; CHECK-RV64VC-NEXT:    slli a7, s0, 3
 ; CHECK-RV64VC-NEXT:    mv a4, a3
-; CHECK-RV64VC-NEXT:    bltu a3, a7, .LBB919_4
+; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB919_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
-; CHECK-RV64VC-NEXT:    mv a4, a7
+; CHECK-RV64VC-NEXT:    mv a4, a1
 ; CHECK-RV64VC-NEXT:  .LBB919_4:
 ; CHECK-RV64VC-NEXT:    vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT:    add a1, s1, a0
+; CHECK-RV64VC-NEXT:    add a2, s1, a0
 ; CHECK-RV64VC-NEXT:    add a5, a5, s1
-; CHECK-RV64VC-NEXT:    add a2, a2, s1
+; CHECK-RV64VC-NEXT:    add a7, a7, s1
 ; CHECK-RV64VC-NEXT:    mv a0, a4
 ; CHECK-RV64VC-NEXT:    bltu a4, s0, .LBB919_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a0, s0
 ; CHECK-RV64VC-NEXT:  .LBB919_6:
-; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 3
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 3
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT:    addi a1, sp, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    mv a2, a1
-; CHECK-RV64VC-NEXT:    slli a1, a1, 1
-; CHECK-RV64VC-NEXT:    add a1, a1, a2
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    addi a2, sp, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    mv a5, a2
+; CHECK-RV64VC-NEXT:    slli a2, a2, 1
+; CHECK-RV64VC-NEXT:    add a2, a2, a5
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, s0
-; CHECK-RV64VC-NEXT:    sub a1, a3, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a1
+; CHECK-RV64VC-NEXT:    and a0, a3, a4
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT:    mv a1, a0
+; CHECK-RV64VC-NEXT:    mv a2, a0
 ; CHECK-RV64VC-NEXT:    bltu a0, s0, .LBB919_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a2, s0
 ; CHECK-RV64VC-NEXT:  .LBB919_8:
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT:    sub a1, a0, s0
 ; CHECK-RV64VC-NEXT:    sub a2, s2, a6
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, s2, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a6, s2
+; CHECK-RV64VC-NEXT:    sub a4, a0, s0
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
-; CHECK-RV64VC-NEXT:    and a1, a1, a0
+; CHECK-RV64VC-NEXT:    and a4, a4, a0
 ; CHECK-RV64VC-NEXT:    and a0, a3, a2
 ; CHECK-RV64VC-NEXT:    addi a2, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64VC-NEXT:    mv a2, a0
-; CHECK-RV64VC-NEXT:    bltu a0, a7, .LBB919_10
+; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB919_10
 ; CHECK-RV64VC-NEXT:  # %bb.9:
-; CHECK-RV64VC-NEXT:    mv a2, a7
+; CHECK-RV64VC-NEXT:    mv a2, a1
 ; CHECK-RV64VC-NEXT:  .LBB919_10:
-; CHECK-RV64VC-NEXT:    mv a1, a2
+; CHECK-RV64VC-NEXT:    mv a3, a2
 ; CHECK-RV64VC-NEXT:    bltu a2, s0, .LBB919_12
 ; CHECK-RV64VC-NEXT:  # %bb.11:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a3, s0
 ; CHECK-RV64VC-NEXT:  .LBB919_12:
-; CHECK-RV64VC-NEXT:    csrr a3, vlenb
-; CHECK-RV64VC-NEXT:    slli a3, a3, 3
-; CHECK-RV64VC-NEXT:    add a3, a3, sp
-; CHECK-RV64VC-NEXT:    addi a3, a3, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    csrr a4, vlenb
+; CHECK-RV64VC-NEXT:    slli a4, a4, 3
+; CHECK-RV64VC-NEXT:    add a4, a4, sp
+; CHECK-RV64VC-NEXT:    addi a4, a4, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT:    sub a3, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    sub a1, a2, s0
-; CHECK-RV64VC-NEXT:    sub a3, a0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a3
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a1, a1, a2
@@ -40233,7 +40233,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a1, a0, s0
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -40302,13 +40302,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32VC-NEXT:    sub a0, a4, a1
-; CHECK-RV32VC-NEXT:    sub a2, a3, a2
-; CHECK-RV32VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV32VC-NEXT:    sub a5, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32VC-NEXT:    addi a4, a4, -1
-; CHECK-RV32VC-NEXT:    addi a3, a3, -1
+; CHECK-RV32VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a0
-; CHECK-RV32VC-NEXT:    and a0, a3, a2
+; CHECK-RV32VC-NEXT:    and a0, a2, a5
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v0
@@ -40320,10 +40320,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT:    sub a1, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
index 380287dd555c9..1c95c753c8ed1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
@@ -263,13 +263,13 @@ define <vscale x 32 x bfloat> @vp_rint_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v5, v6
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -321,14 +321,14 @@ define <vscale x 32 x bfloat> @vp_rint_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v6, v7
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -786,13 +786,13 @@ define <vscale x 32 x half> @vp_rint_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v5, v6
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -858,14 +858,14 @@ define <vscale x 32 x half> @vp_rint_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v6, v7
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -1751,7 +1751,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFH-NEXT:    sub a2, a0, a1
 ; RV32ZVFH-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT:    sltu a3, a0, a2
+; RV32ZVFH-NEXT:    sltu a3, a1, a0
 ; RV32ZVFH-NEXT:    addi a3, a3, -1
 ; RV32ZVFH-NEXT:    and a2, a3, a2
 ; RV32ZVFH-NEXT:    vmv1r.v v0, v6
@@ -1793,7 +1793,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
@@ -1834,7 +1834,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; RV32ZVFMIN-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFMIN-NEXT:    sub a2, a0, a1
 ; RV32ZVFMIN-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFMIN-NEXT:    sltu a3, a0, a2
+; RV32ZVFMIN-NEXT:    sltu a3, a1, a0
 ; RV32ZVFMIN-NEXT:    addi a3, a3, -1
 ; RV32ZVFMIN-NEXT:    and a2, a3, a2
 ; RV32ZVFMIN-NEXT:    vmv1r.v v0, v6
@@ -1876,7 +1876,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -1917,7 +1917,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFH-NEXT:    sub a3, a0, a1
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT:    sltu a2, a0, a3
+; RV32ZVFH-NEXT:    sltu a2, a1, a0
 ; RV32ZVFH-NEXT:    addi a2, a2, -1
 ; RV32ZVFH-NEXT:    and a2, a2, a3
 ; RV32ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -1947,7 +1947,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -1976,7 +1976,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV32ZVFMIN-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFMIN-NEXT:    sub a3, a0, a1
 ; RV32ZVFMIN-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFMIN-NEXT:    sltu a2, a0, a3
+; RV32ZVFMIN-NEXT:    sltu a2, a1, a0
 ; RV32ZVFMIN-NEXT:    addi a2, a2, -1
 ; RV32ZVFMIN-NEXT:    and a2, a2, a3
 ; RV32ZVFMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2006,7 +2006,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
index 37c036d38148a..605b07c81f45a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v5, v6
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v6, v7
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v5, v6
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v6, v7
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFH-NEXT:    sub a2, a0, a1
 ; RV32ZVFH-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT:    sltu a3, a0, a2
+; RV32ZVFH-NEXT:    sltu a3, a1, a0
 ; RV32ZVFH-NEXT:    addi a3, a3, -1
 ; RV32ZVFH-NEXT:    and a2, a3, a2
 ; RV32ZVFH-NEXT:    vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFHMIN-NEXT:    sub a2, a0, a1
 ; RV32ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT:    sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a3, a3, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a3, a2
 ; RV32ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFH-NEXT:    sub a3, a0, a1
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT:    sltu a2, a0, a3
+; RV32ZVFH-NEXT:    sltu a2, a1, a0
 ; RV32ZVFH-NEXT:    addi a2, a2, -1
 ; RV32ZVFH-NEXT:    and a2, a2, a3
 ; RV32ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
index 37a9ec1c0a8aa..6869bc2050698 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16(<vscale x 32 x bfloat> %va
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v5, v6
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16_unmasked(<vscale x 32 x bf
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v6, v7
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v5, v6
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v6, v7
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFH-NEXT:    sub a2, a0, a1
 ; RV32ZVFH-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT:    sltu a3, a0, a2
+; RV32ZVFH-NEXT:    sltu a3, a1, a0
 ; RV32ZVFH-NEXT:    addi a3, a3, -1
 ; RV32ZVFH-NEXT:    and a2, a3, a2
 ; RV32ZVFH-NEXT:    vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFHMIN-NEXT:    sub a2, a0, a1
 ; RV32ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT:    sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a3, a3, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a3, a2
 ; RV32ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFH-NEXT:    sub a3, a0, a1
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT:    sltu a2, a0, a3
+; RV32ZVFH-NEXT:    sltu a2, a1, a0
 ; RV32ZVFH-NEXT:    addi a2, a2, -1
 ; RV32ZVFH-NEXT:    and a2, a2, a3
 ; RV32ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
index 5553b988fec97..8869a440c8634 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16(<vscale x 32 x bfloat> %
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v5, v6
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16_unmasked(<vscale x 32 x
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v6, v7
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v5, v6
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x hal
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v6, v7
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFH-NEXT:    sub a2, a0, a1
 ; RV32ZVFH-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT:    sltu a3, a0, a2
+; RV32ZVFH-NEXT:    sltu a3, a1, a0
 ; RV32ZVFH-NEXT:    addi a3, a3, -1
 ; RV32ZVFH-NEXT:    and a2, a3, a2
 ; RV32ZVFH-NEXT:    vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFHMIN-NEXT:    sub a2, a0, a1
 ; RV32ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT:    sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a3, a3, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a3, a2
 ; RV32ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFH-NEXT:    sub a3, a0, a1
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT:    sltu a2, a0, a3
+; RV32ZVFH-NEXT:    sltu a2, a1, a0
 ; RV32ZVFH-NEXT:    addi a2, a2, -1
 ; RV32ZVFH-NEXT:    and a2, a2, a3
 ; RV32ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 634e58198def3..b67ab5c3c9efa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1337,211 +1337,404 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, b
 }
 
 define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vscale x 64 x bfloat> %vb, <vscale x 64 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: fcmp_oeq_vv_nxv64bf16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    slli a1, a1, 2
-; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 2
-; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
-; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv8r.v v0, v16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, a1, a3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a1, a3, 3
-; CHECK-NEXT:    slli a5, a3, 2
-; CHECK-NEXT:    slli a4, a3, 1
-; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    sub a6, a2, a5
-; CHECK-NEXT:    vl8re16.v v24, (a1)
-; CHECK-NEXT:    sltu a1, a2, a6
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a6, a1, a6
-; CHECK-NEXT:    sub a1, a6, a4
-; CHECK-NEXT:    sltu a7, a6, a1
-; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    and a7, a7, a1
-; CHECK-NEXT:    srli a1, a3, 1
-; CHECK-NEXT:    srli a3, a3, 2
-; CHECK-NEXT:    csrr t0, vlenb
-; CHECK-NEXT:    slli t0, t0, 1
-; CHECK-NEXT:    mv t1, t0
-; CHECK-NEXT:    slli t0, t0, 2
-; CHECK-NEXT:    add t1, t1, t0
-; CHECK-NEXT:    slli t0, t0, 1
-; CHECK-NEXT:    add t0, t0, t1
-; CHECK-NEXT:    add t0, sp, t0
-; CHECK-NEXT:    addi t0, t0, 16
-; CHECK-NEXT:    vl1r.v v8, (t0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT:    vslidedown.vx v16, v8, a1
-; CHECK-NEXT:    vl8re16.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    mv t0, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a0, a0, t0
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a3
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a7, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v4
-; CHECK-NEXT:    bltu a6, a4, .LBB85_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a6, a4
-; CHECK-NEXT:  .LBB85_2:
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v5, v8, v16, v0.t
-; CHECK-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v7, v24, v16, v0.t
-; CHECK-NEXT:    bltu a2, a5, .LBB85_4
-; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    mv a2, a5
-; CHECK-NEXT:  .LBB85_4:
-; CHECK-NEXT:    sub a0, a2, a4
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 1
-; CHECK-NEXT:    mv a6, a5
-; CHECK-NEXT:    slli a5, a5, 2
-; CHECK-NEXT:    add a6, a6, a5
-; CHECK-NEXT:    slli a5, a5, 1
-; CHECK-NEXT:    add a5, a5, a6
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl1r.v v8, (a5) # vscale x 8-byte Folded Reload
-; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a3
-; CHECK-NEXT:    sltu a5, a2, a0
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a0, a5, a0
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 1
-; CHECK-NEXT:    mv a6, a5
-; CHECK-NEXT:    slli a5, a5, 3
-; CHECK-NEXT:    add a5, a5, a6
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    mv a5, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a0, a0, a5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v10, v16, v24, v0.t
-; CHECK-NEXT:    vmv1r.v v9, v7
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v5, a3
-; CHECK-NEXT:    bltu a2, a4, .LBB85_6
-; CHECK-NEXT:  # %bb.5:
-; CHECK-NEXT:    mv a2, a4
-; CHECK-NEXT:  .LBB85_6:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    mv a4, a0
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, a0, a4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    mv a2, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    mv a2, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a2, a2, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v9, a1
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
+; CHECK32-LABEL: fcmp_oeq_vv_nxv64bf16:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    addi sp, sp, -16
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    csrr a1, vlenb
+; CHECK32-NEXT:    mv a3, a1
+; CHECK32-NEXT:    slli a1, a1, 1
+; CHECK32-NEXT:    add a3, a3, a1
+; CHECK32-NEXT:    slli a1, a1, 2
+; CHECK32-NEXT:    add a3, a3, a1
+; CHECK32-NEXT:    slli a1, a1, 1
+; CHECK32-NEXT:    add a1, a1, a3
+; CHECK32-NEXT:    sub sp, sp, a1
+; CHECK32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
+; CHECK32-NEXT:    csrr a1, vlenb
+; CHECK32-NEXT:    slli a1, a1, 1
+; CHECK32-NEXT:    mv a3, a1
+; CHECK32-NEXT:    slli a1, a1, 2
+; CHECK32-NEXT:    add a3, a3, a1
+; CHECK32-NEXT:    slli a1, a1, 1
+; CHECK32-NEXT:    add a1, a1, a3
+; CHECK32-NEXT:    add a1, sp, a1
+; CHECK32-NEXT:    addi a1, a1, 16
+; CHECK32-NEXT:    vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK32-NEXT:    vmv8r.v v0, v16
+; CHECK32-NEXT:    csrr a1, vlenb
+; CHECK32-NEXT:    slli a1, a1, 1
+; CHECK32-NEXT:    mv a3, a1
+; CHECK32-NEXT:    slli a1, a1, 3
+; CHECK32-NEXT:    add a1, a1, a3
+; CHECK32-NEXT:    add a1, sp, a1
+; CHECK32-NEXT:    addi a1, a1, 16
+; CHECK32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT:    csrr a3, vlenb
+; CHECK32-NEXT:    srli a1, a3, 1
+; CHECK32-NEXT:    slli a4, a3, 3
+; CHECK32-NEXT:    slli a6, a3, 2
+; CHECK32-NEXT:    slli a5, a3, 1
+; CHECK32-NEXT:    add a4, a0, a4
+; CHECK32-NEXT:    sub a7, a2, a6
+; CHECK32-NEXT:    sltu t0, a6, a2
+; CHECK32-NEXT:    vl8re16.v v24, (a4)
+; CHECK32-NEXT:    addi t0, t0, -1
+; CHECK32-NEXT:    and a7, t0, a7
+; CHECK32-NEXT:    sub a4, a7, a5
+; CHECK32-NEXT:    sltu t0, a5, a7
+; CHECK32-NEXT:    addi t0, t0, -1
+; CHECK32-NEXT:    and t0, t0, a4
+; CHECK32-NEXT:    srli a4, a3, 2
+; CHECK32-NEXT:    csrr t1, vlenb
+; CHECK32-NEXT:    slli t1, t1, 1
+; CHECK32-NEXT:    mv t2, t1
+; CHECK32-NEXT:    slli t1, t1, 2
+; CHECK32-NEXT:    add t2, t2, t1
+; CHECK32-NEXT:    slli t1, t1, 1
+; CHECK32-NEXT:    add t1, t1, t2
+; CHECK32-NEXT:    add t1, sp, t1
+; CHECK32-NEXT:    addi t1, t1, 16
+; CHECK32-NEXT:    vl1r.v v8, (t1) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT:    vslidedown.vx v16, v8, a1
+; CHECK32-NEXT:    vl8re16.v v8, (a0)
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    mv t1, a0
+; CHECK32-NEXT:    slli a0, a0, 2
+; CHECK32-NEXT:    add a0, a0, t1
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK32-NEXT:    vslidedown.vx v8, v16, a4
+; CHECK32-NEXT:    addi a0, sp, 16
+; CHECK32-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT:    vsetvli zero, t0, e16, m4, ta, ma
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v8, v4
+; CHECK32-NEXT:    bltu a7, a5, .LBB85_2
+; CHECK32-NEXT:  # %bb.1:
+; CHECK32-NEXT:    mv a7, a5
+; CHECK32-NEXT:  .LBB85_2:
+; CHECK32-NEXT:    addi a0, sp, 16
+; CHECK32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT:    vmfeq.vv v5, v8, v16, v0.t
+; CHECK32-NEXT:    vsetvli zero, a7, e16, m4, ta, ma
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT:    vmfeq.vv v7, v24, v16, v0.t
+; CHECK32-NEXT:    bltu a2, a6, .LBB85_4
+; CHECK32-NEXT:  # %bb.3:
+; CHECK32-NEXT:    mv a2, a6
+; CHECK32-NEXT:  .LBB85_4:
+; CHECK32-NEXT:    sub a0, a2, a5
+; CHECK32-NEXT:    sltu a6, a5, a2
+; CHECK32-NEXT:    csrr a7, vlenb
+; CHECK32-NEXT:    slli a7, a7, 1
+; CHECK32-NEXT:    mv t0, a7
+; CHECK32-NEXT:    slli a7, a7, 2
+; CHECK32-NEXT:    add t0, t0, a7
+; CHECK32-NEXT:    slli a7, a7, 1
+; CHECK32-NEXT:    add a7, a7, t0
+; CHECK32-NEXT:    add a7, sp, a7
+; CHECK32-NEXT:    addi a7, a7, 16
+; CHECK32-NEXT:    vl1r.v v8, (a7) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT:    vsetvli a7, zero, e8, mf2, ta, ma
+; CHECK32-NEXT:    vslidedown.vx v0, v8, a4
+; CHECK32-NEXT:    addi a6, a6, -1
+; CHECK32-NEXT:    and a0, a6, a0
+; CHECK32-NEXT:    csrr a6, vlenb
+; CHECK32-NEXT:    slli a6, a6, 1
+; CHECK32-NEXT:    mv a7, a6
+; CHECK32-NEXT:    slli a6, a6, 3
+; CHECK32-NEXT:    add a6, a6, a7
+; CHECK32-NEXT:    add a6, sp, a6
+; CHECK32-NEXT:    addi a6, a6, 16
+; CHECK32-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    mv a6, a0
+; CHECK32-NEXT:    slli a0, a0, 2
+; CHECK32-NEXT:    add a0, a0, a6
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK32-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT:    vmfeq.vv v10, v16, v24, v0.t
+; CHECK32-NEXT:    vmv1r.v v9, v7
+; CHECK32-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK32-NEXT:    vslideup.vx v9, v5, a4
+; CHECK32-NEXT:    bltu a2, a5, .LBB85_6
+; CHECK32-NEXT:  # %bb.5:
+; CHECK32-NEXT:    mv a2, a5
+; CHECK32-NEXT:  .LBB85_6:
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    mv a5, a0
+; CHECK32-NEXT:    slli a0, a0, 3
+; CHECK32-NEXT:    add a0, a0, a5
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    mv a2, a0
+; CHECK32-NEXT:    slli a0, a0, 2
+; CHECK32-NEXT:    add a0, a0, a2
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v24, v0
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    mv a2, a0
+; CHECK32-NEXT:    slli a0, a0, 2
+; CHECK32-NEXT:    add a2, a2, a0
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    add a0, a0, a2
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT:    vmfeq.vv v8, v16, v24, v0.t
+; CHECK32-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK32-NEXT:    vslideup.vx v8, v10, a4
+; CHECK32-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK32-NEXT:    vslideup.vx v8, v9, a1
+; CHECK32-NEXT:    vmv.v.v v0, v8
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    mv a1, a0
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    add a1, a1, a0
+; CHECK32-NEXT:    slli a0, a0, 2
+; CHECK32-NEXT:    add a1, a1, a0
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    add a0, a0, a1
+; CHECK32-NEXT:    add sp, sp, a0
+; CHECK32-NEXT:    .cfi_def_cfa sp, 16
+; CHECK32-NEXT:    addi sp, sp, 16
+; CHECK32-NEXT:    .cfi_def_cfa_offset 0
+; CHECK32-NEXT:    ret
+;
+; CHECK64-LABEL: fcmp_oeq_vv_nxv64bf16:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    addi sp, sp, -16
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    csrr a1, vlenb
+; CHECK64-NEXT:    mv a3, a1
+; CHECK64-NEXT:    slli a1, a1, 1
+; CHECK64-NEXT:    add a3, a3, a1
+; CHECK64-NEXT:    slli a1, a1, 2
+; CHECK64-NEXT:    add a3, a3, a1
+; CHECK64-NEXT:    slli a1, a1, 1
+; CHECK64-NEXT:    add a1, a1, a3
+; CHECK64-NEXT:    sub sp, sp, a1
+; CHECK64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
+; CHECK64-NEXT:    csrr a1, vlenb
+; CHECK64-NEXT:    slli a1, a1, 1
+; CHECK64-NEXT:    mv a3, a1
+; CHECK64-NEXT:    slli a1, a1, 3
+; CHECK64-NEXT:    add a1, a1, a3
+; CHECK64-NEXT:    add a1, sp, a1
+; CHECK64-NEXT:    addi a1, a1, 16
+; CHECK64-NEXT:    vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK64-NEXT:    vmv8r.v v0, v16
+; CHECK64-NEXT:    csrr a1, vlenb
+; CHECK64-NEXT:    mv a3, a1
+; CHECK64-NEXT:    slli a1, a1, 1
+; CHECK64-NEXT:    add a3, a3, a1
+; CHECK64-NEXT:    slli a1, a1, 3
+; CHECK64-NEXT:    add a1, a1, a3
+; CHECK64-NEXT:    add a1, sp, a1
+; CHECK64-NEXT:    addi a1, a1, 16
+; CHECK64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT:    csrr a3, vlenb
+; CHECK64-NEXT:    slli a1, a3, 3
+; CHECK64-NEXT:    slli a5, a3, 2
+; CHECK64-NEXT:    slli a4, a3, 1
+; CHECK64-NEXT:    add a1, a0, a1
+; CHECK64-NEXT:    sub a6, a2, a5
+; CHECK64-NEXT:    sltu a7, a5, a2
+; CHECK64-NEXT:    vl8re16.v v24, (a1)
+; CHECK64-NEXT:    addi a7, a7, -1
+; CHECK64-NEXT:    and a6, a7, a6
+; CHECK64-NEXT:    sub a1, a6, a4
+; CHECK64-NEXT:    sltu a7, a4, a6
+; CHECK64-NEXT:    addi a7, a7, -1
+; CHECK64-NEXT:    and a7, a7, a1
+; CHECK64-NEXT:    srli a1, a3, 1
+; CHECK64-NEXT:    srli a3, a3, 2
+; CHECK64-NEXT:    csrr t0, vlenb
+; CHECK64-NEXT:    slli t0, t0, 1
+; CHECK64-NEXT:    mv t1, t0
+; CHECK64-NEXT:    slli t0, t0, 3
+; CHECK64-NEXT:    add t0, t0, t1
+; CHECK64-NEXT:    add t0, sp, t0
+; CHECK64-NEXT:    addi t0, t0, 16
+; CHECK64-NEXT:    vl1r.v v8, (t0) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT:    vslidedown.vx v16, v8, a1
+; CHECK64-NEXT:    vl8re16.v v8, (a0)
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    mv t0, a0
+; CHECK64-NEXT:    slli a0, a0, 2
+; CHECK64-NEXT:    add a0, a0, t0
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK64-NEXT:    vslidedown.vx v8, v16, a3
+; CHECK64-NEXT:    addi a0, sp, 16
+; CHECK64-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT:    vsetvli zero, a7, e16, m4, ta, ma
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v8, v4
+; CHECK64-NEXT:    bltu a6, a4, .LBB85_2
+; CHECK64-NEXT:  # %bb.1:
+; CHECK64-NEXT:    mv a6, a4
+; CHECK64-NEXT:  .LBB85_2:
+; CHECK64-NEXT:    addi a0, sp, 16
+; CHECK64-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT:    vmfeq.vv v5, v8, v16, v0.t
+; CHECK64-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT:    vmfeq.vv v6, v24, v16, v0.t
+; CHECK64-NEXT:    bltu a2, a5, .LBB85_4
+; CHECK64-NEXT:  # %bb.3:
+; CHECK64-NEXT:    mv a2, a5
+; CHECK64-NEXT:  .LBB85_4:
+; CHECK64-NEXT:    sub a0, a2, a4
+; CHECK64-NEXT:    sltu a5, a4, a2
+; CHECK64-NEXT:    csrr a6, vlenb
+; CHECK64-NEXT:    slli a6, a6, 1
+; CHECK64-NEXT:    mv a7, a6
+; CHECK64-NEXT:    slli a6, a6, 3
+; CHECK64-NEXT:    add a6, a6, a7
+; CHECK64-NEXT:    add a6, sp, a6
+; CHECK64-NEXT:    addi a6, a6, 16
+; CHECK64-NEXT:    vl1r.v v7, (a6) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT:    vsetvli a6, zero, e8, mf2, ta, ma
+; CHECK64-NEXT:    vslidedown.vx v0, v7, a3
+; CHECK64-NEXT:    addi a5, a5, -1
+; CHECK64-NEXT:    and a0, a5, a0
+; CHECK64-NEXT:    csrr a5, vlenb
+; CHECK64-NEXT:    mv a6, a5
+; CHECK64-NEXT:    slli a5, a5, 1
+; CHECK64-NEXT:    add a6, a6, a5
+; CHECK64-NEXT:    slli a5, a5, 3
+; CHECK64-NEXT:    add a5, a5, a6
+; CHECK64-NEXT:    add a5, sp, a5
+; CHECK64-NEXT:    addi a5, a5, 16
+; CHECK64-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    mv a5, a0
+; CHECK64-NEXT:    slli a0, a0, 2
+; CHECK64-NEXT:    add a0, a0, a5
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK64-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT:    vmfeq.vv v4, v16, v24, v0.t
+; CHECK64-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK64-NEXT:    vslideup.vx v6, v5, a3
+; CHECK64-NEXT:    bltu a2, a4, .LBB85_6
+; CHECK64-NEXT:  # %bb.5:
+; CHECK64-NEXT:    mv a2, a4
+; CHECK64-NEXT:  .LBB85_6:
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    mv a4, a0
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    add a4, a4, a0
+; CHECK64-NEXT:    slli a0, a0, 3
+; CHECK64-NEXT:    add a0, a0, a4
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    mv a2, a0
+; CHECK64-NEXT:    slli a0, a0, 2
+; CHECK64-NEXT:    add a0, a0, a2
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK64-NEXT:    vmv1r.v v0, v7
+; CHECK64-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT:    vmfeq.vv v8, v16, v24, v0.t
+; CHECK64-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK64-NEXT:    vslideup.vx v8, v4, a3
+; CHECK64-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK64-NEXT:    vslideup.vx v8, v6, a1
+; CHECK64-NEXT:    vmv.v.v v0, v8
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    mv a1, a0
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    add a1, a1, a0
+; CHECK64-NEXT:    slli a0, a0, 2
+; CHECK64-NEXT:    add a1, a1, a0
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    add a0, a0, a1
+; CHECK64-NEXT:    add sp, sp, a0
+; CHECK64-NEXT:    .cfi_def_cfa sp, 16
+; CHECK64-NEXT:    addi sp, sp, 16
+; CHECK64-NEXT:    .cfi_def_cfa_offset 0
+; CHECK64-NEXT:    ret
   %v = call <vscale x 64 x i1> @llvm.vp.fcmp.nxv64bf16(<vscale x 64 x bfloat> %va, <vscale x 64 x bfloat> %vb, metadata !"oeq", <vscale x 64 x i1> %m, i32 %evl)
   ret <vscale x 64 x i1> %v
 }
@@ -3479,257 +3672,6 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 }
 
 define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscale x 64 x half> %vb, <vscale x 64 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: fcmp_oeq_vv_nxv64f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    addi sp, sp, -16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 16
-; ZVFH-NEXT:    csrr a1, vlenb
-; ZVFH-NEXT:    slli a1, a1, 3
-; ZVFH-NEXT:    sub sp, sp, a1
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFH-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; ZVFH-NEXT:    vmv1r.v v7, v0
-; ZVFH-NEXT:    addi a1, sp, 16
-; ZVFH-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT:    csrr a3, vlenb
-; ZVFH-NEXT:    srli a1, a3, 1
-; ZVFH-NEXT:    slli a4, a3, 3
-; ZVFH-NEXT:    slli a3, a3, 2
-; ZVFH-NEXT:    add a4, a0, a4
-; ZVFH-NEXT:    sub a5, a2, a3
-; ZVFH-NEXT:    vl8re16.v v24, (a4)
-; ZVFH-NEXT:    sltu a4, a2, a5
-; ZVFH-NEXT:    addi a4, a4, -1
-; ZVFH-NEXT:    vl8re16.v v8, (a0)
-; ZVFH-NEXT:    vslidedown.vx v0, v0, a1
-; ZVFH-NEXT:    and a4, a4, a5
-; ZVFH-NEXT:    vsetvli zero, a4, e16, m8, ta, ma
-; ZVFH-NEXT:    vmfeq.vv v6, v16, v24, v0.t
-; ZVFH-NEXT:    bltu a2, a3, .LBB171_2
-; ZVFH-NEXT:  # %bb.1:
-; ZVFH-NEXT:    mv a2, a3
-; ZVFH-NEXT:  .LBB171_2:
-; ZVFH-NEXT:    vmv1r.v v0, v7
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; ZVFH-NEXT:    vmfeq.vv v16, v24, v8, v0.t
-; ZVFH-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; ZVFH-NEXT:    vslideup.vx v16, v6, a1
-; ZVFH-NEXT:    vmv.v.v v0, v16
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
-; ZVFH-NEXT:    add sp, sp, a0
-; ZVFH-NEXT:    .cfi_def_cfa sp, 16
-; ZVFH-NEXT:    addi sp, sp, 16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 0
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv64f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    mv a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a3, a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    add a3, a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    mv a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    add a3, a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    mv a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, a1, a3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a1, a3, 3
-; ZVFHMIN-NEXT:    slli a5, a3, 2
-; ZVFHMIN-NEXT:    slli a4, a3, 1
-; ZVFHMIN-NEXT:    add a1, a0, a1
-; ZVFHMIN-NEXT:    sub a6, a2, a5
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a1)
-; ZVFHMIN-NEXT:    sltu a1, a2, a6
-; ZVFHMIN-NEXT:    addi a1, a1, -1
-; ZVFHMIN-NEXT:    and a6, a1, a6
-; ZVFHMIN-NEXT:    sub a1, a6, a4
-; ZVFHMIN-NEXT:    sltu a7, a6, a1
-; ZVFHMIN-NEXT:    addi a7, a7, -1
-; ZVFHMIN-NEXT:    and a7, a7, a1
-; ZVFHMIN-NEXT:    srli a1, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    csrr t0, vlenb
-; ZVFHMIN-NEXT:    slli t0, t0, 1
-; ZVFHMIN-NEXT:    mv t1, t0
-; ZVFHMIN-NEXT:    slli t0, t0, 2
-; ZVFHMIN-NEXT:    add t1, t1, t0
-; ZVFHMIN-NEXT:    slli t0, t0, 1
-; ZVFHMIN-NEXT:    add t0, t0, t1
-; ZVFHMIN-NEXT:    add t0, sp, t0
-; ZVFHMIN-NEXT:    addi t0, t0, 16
-; ZVFHMIN-NEXT:    vl1r.v v8, (t0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v8, a1
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    mv t0, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a0, a0, t0
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v8, v16, a3
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a7, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
-; ZVFHMIN-NEXT:    bltu a6, a4, .LBB171_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a6, a4
-; ZVFHMIN-NEXT:  .LBB171_2:
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v5, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a2, a5, .LBB171_4
-; ZVFHMIN-NEXT:  # %bb.3:
-; ZVFHMIN-NEXT:    mv a2, a5
-; ZVFHMIN-NEXT:  .LBB171_4:
-; ZVFHMIN-NEXT:    sub a0, a2, a4
-; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    slli a5, a5, 1
-; ZVFHMIN-NEXT:    mv a6, a5
-; ZVFHMIN-NEXT:    slli a5, a5, 2
-; ZVFHMIN-NEXT:    add a6, a6, a5
-; ZVFHMIN-NEXT:    slli a5, a5, 1
-; ZVFHMIN-NEXT:    add a5, a5, a6
-; ZVFHMIN-NEXT:    add a5, sp, a5
-; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl1r.v v8, (a5) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a5, a2, a0
-; ZVFHMIN-NEXT:    addi a5, a5, -1
-; ZVFHMIN-NEXT:    and a0, a5, a0
-; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    slli a5, a5, 1
-; ZVFHMIN-NEXT:    mv a6, a5
-; ZVFHMIN-NEXT:    slli a5, a5, 3
-; ZVFHMIN-NEXT:    add a5, a5, a6
-; ZVFHMIN-NEXT:    add a5, sp, a5
-; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    mv a5, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a0, a0, a5
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v10, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v9, v7
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v9, v5, a3
-; ZVFHMIN-NEXT:    bltu a2, a4, .LBB171_6
-; ZVFHMIN-NEXT:  # %bb.5:
-; ZVFHMIN-NEXT:    mv a2, a4
-; ZVFHMIN-NEXT:  .LBB171_6:
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    mv a4, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, a0, a4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    mv a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a0, a0, a2
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    mv a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a2, a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a2
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v8, v10, a3
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v8, v9, a1
-; ZVFHMIN-NEXT:    vmv.v.v v0, v8
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a1, a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a1, a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 64 x i1> @llvm.vp.fcmp.nxv64f16(<vscale x 64 x half> %va, <vscale x 64 x half> %vb, metadata !"oeq", <vscale x 64 x i1> %m, i32 %evl)
   ret <vscale x 64 x i1> %v
 }
@@ -4879,7 +4821,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK32-NEXT:    add a4, s3, a6
 ; CHECK32-NEXT:    vl8re64.v v24, (s3)
 ; CHECK32-NEXT:    sub a6, a3, s0
-; CHECK32-NEXT:    sltu a7, a3, a6
+; CHECK32-NEXT:    sltu a7, s0, a3
 ; CHECK32-NEXT:    addi a7, a7, -1
 ; CHECK32-NEXT:    and a6, a7, a6
 ; CHECK32-NEXT:    csrr a7, vlenb
@@ -4919,7 +4861,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK32-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; CHECK32-NEXT:    vl8re64.v v16, (a4)
 ; CHECK32-NEXT:    sub a1, s1, a2
-; CHECK32-NEXT:    sltu a2, s1, a1
+; CHECK32-NEXT:    sltu a2, a2, s1
 ; CHECK32-NEXT:    vl8re64.v v24, (s2)
 ; CHECK32-NEXT:    addi a2, a2, -1
 ; CHECK32-NEXT:    and s1, a2, a1
@@ -4964,7 +4906,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK32-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
 ; CHECK32-NEXT:    vslideup.vx v9, v8, s4
 ; CHECK32-NEXT:    sub a1, s1, s0
-; CHECK32-NEXT:    sltu a2, s1, a1
+; CHECK32-NEXT:    sltu a2, s0, s1
 ; CHECK32-NEXT:    addi a2, a2, -1
 ; CHECK32-NEXT:    and a1, a2, a1
 ; CHECK32-NEXT:    csrr a2, vlenb
@@ -4979,7 +4921,8 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK32-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
 ; CHECK32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK32-NEXT:    vmfeq.vv v8, v24, v16, v0.t
-; CHECK32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK32-NEXT:    srli s0, s0, 1
+; CHECK32-NEXT:    vsetvli zero, s0, e8, mf2, ta, ma
 ; CHECK32-NEXT:    vslideup.vx v9, v8, a0
 ; CHECK32-NEXT:    vmv1r.v v0, v9
 ; CHECK32-NEXT:    csrr a0, vlenb
@@ -5090,7 +5033,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK64-NEXT:    add a4, s3, a6
 ; CHECK64-NEXT:    vl8re64.v v24, (s3)
 ; CHECK64-NEXT:    sub a6, a3, s0
-; CHECK64-NEXT:    sltu a7, a3, a6
+; CHECK64-NEXT:    sltu a7, s0, a3
 ; CHECK64-NEXT:    addi a7, a7, -1
 ; CHECK64-NEXT:    and a6, a7, a6
 ; CHECK64-NEXT:    csrr a7, vlenb
@@ -5130,7 +5073,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK64-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; CHECK64-NEXT:    vl8re64.v v16, (a4)
 ; CHECK64-NEXT:    sub a1, s1, a2
-; CHECK64-NEXT:    sltu a2, s1, a1
+; CHECK64-NEXT:    sltu a2, a2, s1
 ; CHECK64-NEXT:    vl8re64.v v24, (s2)
 ; CHECK64-NEXT:    addi a2, a2, -1
 ; CHECK64-NEXT:    and s1, a2, a1
@@ -5175,7 +5118,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK64-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
 ; CHECK64-NEXT:    vslideup.vx v9, v8, s4
 ; CHECK64-NEXT:    sub a1, s1, s0
-; CHECK64-NEXT:    sltu a2, s1, a1
+; CHECK64-NEXT:    sltu a2, s0, s1
 ; CHECK64-NEXT:    addi a2, a2, -1
 ; CHECK64-NEXT:    and a1, a2, a1
 ; CHECK64-NEXT:    csrr a2, vlenb
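A note on the recurring pattern in these regenerated checks (illustration only, not part of the patch): the scalar sub/sltu/addi/and sequence in the old CHECK lines clamps the tail EVL to zero once %evl drops below the split point, i.e. it computes an unsigned saturating subtraction. A minimal C++ sketch of what those four instructions compute, with invented names (tailEVL, EVL, SplitPoint):

    #include <cstdint>

    // Mirrors the old CHECK lines: sub / sltu evl, diff / addi -1 / and.
    uint64_t tailEVL(uint64_t EVL, uint64_t SplitPoint) {
      uint64_t Diff = EVL - SplitPoint;          // sub   (may wrap)
      uint64_t Wrapped = (EVL < Diff) ? 1 : 0;   // sltu evl, diff  (old operand order)
      uint64_t Keep = Wrapped - 1;               // addi -1: 0 if wrapped, all-ones otherwise
      return Keep & Diff;                        // and:  0 when EVL < SplitPoint, else EVL - SplitPoint
    }

The regenerated checks instead feed the two original operands of the sub to the sltu (for example "sltu a4, a3, a2" following "sub a5, a2, a3" in the nxv32i32 tests below), so the compare no longer looks at the wrapped difference.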
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index c1de57bf850ac..829a3b43bd984 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -1083,7 +1083,7 @@ define <vscale x 128 x i1> @icmp_eq_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    sub a4, a3, a1
 ; CHECK-NEXT:    vl8r.v v24, (a2)
-; CHECK-NEXT:    sltu a2, a3, a4
+; CHECK-NEXT:    sltu a2, a1, a3
 ; CHECK-NEXT:    vl8r.v v8, (a0)
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a4
@@ -1120,7 +1120,7 @@ define <vscale x 128 x i1> @icmp_eq_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -1150,7 +1150,7 @@ define <vscale x 128 x i1> @icmp_eq_vx_swap_nxv128i8(<vscale x 128 x i8> %va, i8
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -2195,81 +2195,155 @@ define <vscale x 8 x i1> @icmp_sle_vi_swap_nxv8i32(<vscale x 8 x i32> %va, <vsca
 }
 
 define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: icmp_eq_vv_nxv32i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    srli a1, a3, 2
-; CHECK-NEXT:    slli a4, a3, 3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a0, a4
-; CHECK-NEXT:    sub a5, a2, a3
-; CHECK-NEXT:    vl8re32.v v24, (a4)
-; CHECK-NEXT:    sltu a4, a2, a5
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    vl8re32.v v8, (a0)
-; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    and a4, a4, a5
-; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v6, v16, v24, v0.t
-; CHECK-NEXT:    bltu a2, a3, .LBB189_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a2, a3
-; CHECK-NEXT:  .LBB189_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v16, v24, v8, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v16, v6, a1
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
+; RV32-LABEL: icmp_eq_vv_nxv32i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vmv1r.v v7, v0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    srli a1, a3, 2
+; RV32-NEXT:    slli a5, a3, 3
+; RV32-NEXT:    slli a4, a3, 1
+; RV32-NEXT:    add a5, a0, a5
+; RV32-NEXT:    sub a6, a2, a4
+; RV32-NEXT:    vl8re32.v v24, (a5)
+; RV32-NEXT:    sltu a5, a4, a2
+; RV32-NEXT:    addi a5, a5, -1
+; RV32-NEXT:    vl8re32.v v8, (a0)
+; RV32-NEXT:    vslidedown.vx v0, v0, a1
+; RV32-NEXT:    and a0, a5, a6
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT:    vmseq.vv v6, v16, v24, v0.t
+; RV32-NEXT:    bltu a2, a4, .LBB189_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a4
+; RV32-NEXT:  .LBB189_2:
+; RV32-NEXT:    vmv1r.v v0, v7
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT:    vmseq.vv v16, v24, v8, v0.t
+; RV32-NEXT:    srli a3, a3, 1
+; RV32-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; RV32-NEXT:    vslideup.vx v16, v6, a1
+; RV32-NEXT:    vmv1r.v v0, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    .cfi_def_cfa sp, 16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    .cfi_def_cfa_offset 0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: icmp_eq_vv_nxv32i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vmv1r.v v7, v0
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    srli a1, a3, 2
+; RV64-NEXT:    slli a4, a3, 3
+; RV64-NEXT:    slli a3, a3, 1
+; RV64-NEXT:    add a4, a0, a4
+; RV64-NEXT:    sub a5, a2, a3
+; RV64-NEXT:    vl8re32.v v24, (a4)
+; RV64-NEXT:    sltu a4, a3, a2
+; RV64-NEXT:    addi a4, a4, -1
+; RV64-NEXT:    vl8re32.v v8, (a0)
+; RV64-NEXT:    vslidedown.vx v0, v0, a1
+; RV64-NEXT:    and a4, a4, a5
+; RV64-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
+; RV64-NEXT:    vmseq.vv v6, v16, v24, v0.t
+; RV64-NEXT:    bltu a2, a3, .LBB189_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB189_2:
+; RV64-NEXT:    vmv1r.v v0, v7
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV64-NEXT:    vmseq.vv v16, v24, v8, v0.t
+; RV64-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vslideup.vx v16, v6, a1
+; RV64-NEXT:    vmv1r.v v0, v16
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    ret
   %v = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, metadata !"eq", <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i1> %v
 }
 
 define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: icmp_eq_vx_nxv32i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    srli a2, a3, 2
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a4, a1, a3
-; CHECK-NEXT:    sltu a5, a1, a4
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a4, a5, a4
-; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
-; CHECK-NEXT:    bltu a1, a3, .LBB190_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a1, a3
-; CHECK-NEXT:  .LBB190_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v16, v25, a2
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    ret
+; RV32-LABEL: icmp_eq_vx_nxv32i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vmv1r.v v24, v0
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    srli a2, a3, 2
+; RV32-NEXT:    slli a4, a3, 1
+; RV32-NEXT:    vslidedown.vx v0, v0, a2
+; RV32-NEXT:    sub a5, a1, a4
+; RV32-NEXT:    sltu a6, a4, a1
+; RV32-NEXT:    addi a6, a6, -1
+; RV32-NEXT:    and a5, a6, a5
+; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT:    vmseq.vx v25, v16, a0, v0.t
+; RV32-NEXT:    bltu a1, a4, .LBB190_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a1, a4
+; RV32-NEXT:  .LBB190_2:
+; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmseq.vx v16, v8, a0, v0.t
+; RV32-NEXT:    srli a3, a3, 1
+; RV32-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; RV32-NEXT:    vslideup.vx v16, v25, a2
+; RV32-NEXT:    vmv1r.v v0, v16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: icmp_eq_vx_nxv32i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vmv1r.v v24, v0
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    srli a2, a3, 2
+; RV64-NEXT:    slli a3, a3, 1
+; RV64-NEXT:    vslidedown.vx v0, v0, a2
+; RV64-NEXT:    sub a4, a1, a3
+; RV64-NEXT:    sltu a5, a3, a1
+; RV64-NEXT:    addi a5, a5, -1
+; RV64-NEXT:    and a4, a5, a4
+; RV64-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
+; RV64-NEXT:    vmseq.vx v25, v16, a0, v0.t
+; RV64-NEXT:    bltu a1, a3, .LBB190_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a1, a3
+; RV64-NEXT:  .LBB190_2:
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT:    vmseq.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vslideup.vx v16, v25, a2
+; RV64-NEXT:    vmv1r.v v0, v16
+; RV64-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
   %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, metadata !"eq", <vscale x 32 x i1> %m, i32 %evl)
@@ -2277,31 +2351,58 @@ define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b,
 }
 
 define <vscale x 32 x i1> @icmp_eq_vx_swap_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: icmp_eq_vx_swap_nxv32i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    srli a2, a3, 2
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a4, a1, a3
-; CHECK-NEXT:    sltu a5, a1, a4
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a4, a5, a4
-; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
-; CHECK-NEXT:    bltu a1, a3, .LBB191_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a1, a3
-; CHECK-NEXT:  .LBB191_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v16, v25, a2
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    ret
+; RV32-LABEL: icmp_eq_vx_swap_nxv32i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vmv1r.v v24, v0
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    srli a2, a3, 2
+; RV32-NEXT:    slli a4, a3, 1
+; RV32-NEXT:    vslidedown.vx v0, v0, a2
+; RV32-NEXT:    sub a5, a1, a4
+; RV32-NEXT:    sltu a6, a4, a1
+; RV32-NEXT:    addi a6, a6, -1
+; RV32-NEXT:    and a5, a6, a5
+; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT:    vmseq.vx v25, v16, a0, v0.t
+; RV32-NEXT:    bltu a1, a4, .LBB191_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a1, a4
+; RV32-NEXT:  .LBB191_2:
+; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmseq.vx v16, v8, a0, v0.t
+; RV32-NEXT:    srli a3, a3, 1
+; RV32-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; RV32-NEXT:    vslideup.vx v16, v25, a2
+; RV32-NEXT:    vmv1r.v v0, v16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: icmp_eq_vx_swap_nxv32i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vmv1r.v v24, v0
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    srli a2, a3, 2
+; RV64-NEXT:    slli a3, a3, 1
+; RV64-NEXT:    vslidedown.vx v0, v0, a2
+; RV64-NEXT:    sub a4, a1, a3
+; RV64-NEXT:    sltu a5, a3, a1
+; RV64-NEXT:    addi a5, a5, -1
+; RV64-NEXT:    and a4, a5, a4
+; RV64-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
+; RV64-NEXT:    vmseq.vx v25, v16, a0, v0.t
+; RV64-NEXT:    bltu a1, a3, .LBB191_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a1, a3
+; RV64-NEXT:  .LBB191_2:
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT:    vmseq.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vslideup.vx v16, v25, a2
+; RV64-NEXT:    vmv1r.v v0, v16
+; RV64-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
   %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> %vb, <vscale x 32 x i32> %va, metadata !"eq", <vscale x 32 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
index 6381887a1a2f9..3d34a619ce8bf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
@@ -595,7 +595,7 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV32-NEXT:    vmv1r.v v9, v0
 ; CHECK-RV32-NEXT:    csrr a4, vlenb
 ; CHECK-RV32-NEXT:    sub a2, a3, a4
-; CHECK-RV32-NEXT:    sltu a5, a3, a2
+; CHECK-RV32-NEXT:    sltu a5, a4, a3
 ; CHECK-RV32-NEXT:    addi a5, a5, -1
 ; CHECK-RV32-NEXT:    and a2, a5, a2
 ; CHECK-RV32-NEXT:    bltu a3, a4, .LBB55_2
@@ -621,7 +621,7 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV64-NEXT:    vmv1r.v v9, v0
 ; CHECK-RV64-NEXT:    csrr a4, vlenb
 ; CHECK-RV64-NEXT:    sub a3, a2, a4
-; CHECK-RV64-NEXT:    sltu a5, a2, a3
+; CHECK-RV64-NEXT:    sltu a5, a4, a2
 ; CHECK-RV64-NEXT:    addi a5, a5, -1
 ; CHECK-RV64-NEXT:    and a3, a5, a3
 ; CHECK-RV64-NEXT:    bltu a2, a4, .LBB55_2
@@ -647,19 +647,19 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
 define <vscale x 16 x double> @strided_load_nxv16f64_allones_mask(ptr %ptr, i64 %stride, i32 zeroext %evl) {
 ; CHECK-RV32-LABEL: strided_load_nxv16f64_allones_mask:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    csrr a4, vlenb
-; CHECK-RV32-NEXT:    sub a2, a3, a4
-; CHECK-RV32-NEXT:    sltu a5, a3, a2
+; CHECK-RV32-NEXT:    csrr a2, vlenb
+; CHECK-RV32-NEXT:    sub a4, a3, a2
+; CHECK-RV32-NEXT:    sltu a5, a2, a3
 ; CHECK-RV32-NEXT:    addi a5, a5, -1
-; CHECK-RV32-NEXT:    and a2, a5, a2
-; CHECK-RV32-NEXT:    bltu a3, a4, .LBB56_2
+; CHECK-RV32-NEXT:    and a4, a5, a4
+; CHECK-RV32-NEXT:    bltu a3, a2, .LBB56_2
 ; CHECK-RV32-NEXT:  # %bb.1:
-; CHECK-RV32-NEXT:    mv a3, a4
+; CHECK-RV32-NEXT:    mv a3, a2
 ; CHECK-RV32-NEXT:  .LBB56_2:
-; CHECK-RV32-NEXT:    mul a4, a3, a1
-; CHECK-RV32-NEXT:    add a4, a0, a4
-; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v16, (a4), a1
+; CHECK-RV32-NEXT:    mul a2, a3, a1
+; CHECK-RV32-NEXT:    add a2, a0, a2
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v16, (a2), a1
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v8, (a0), a1
 ; CHECK-RV32-NEXT:    ret
@@ -668,7 +668,7 @@ define <vscale x 16 x double> @strided_load_nxv16f64_allones_mask(ptr %ptr, i64
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    csrr a4, vlenb
 ; CHECK-RV64-NEXT:    sub a3, a2, a4
-; CHECK-RV64-NEXT:    sltu a5, a2, a3
+; CHECK-RV64-NEXT:    sltu a5, a4, a2
 ; CHECK-RV64-NEXT:    addi a5, a5, -1
 ; CHECK-RV64-NEXT:    and a3, a5, a3
 ; CHECK-RV64-NEXT:    bltu a2, a4, .LBB56_2
@@ -703,7 +703,7 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV32-NEXT:    mv a6, a7
 ; CHECK-RV32-NEXT:  .LBB57_2:
 ; CHECK-RV32-NEXT:    sub a5, a6, a2
-; CHECK-RV32-NEXT:    sltu t0, a6, a5
+; CHECK-RV32-NEXT:    sltu t0, a2, a6
 ; CHECK-RV32-NEXT:    addi t0, t0, -1
 ; CHECK-RV32-NEXT:    and t0, t0, a5
 ; CHECK-RV32-NEXT:    mv a5, a6
@@ -713,15 +713,15 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV32-NEXT:  .LBB57_4:
 ; CHECK-RV32-NEXT:    mul t1, a5, a1
 ; CHECK-RV32-NEXT:    srli t2, a2, 3
-; CHECK-RV32-NEXT:    sub a7, a3, a7
 ; CHECK-RV32-NEXT:    vsetvli t3, zero, e8, mf4, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vx v0, v8, t2
+; CHECK-RV32-NEXT:    sub t2, a3, a7
 ; CHECK-RV32-NEXT:    add t1, a0, t1
 ; CHECK-RV32-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v16, (t1), a1, v0.t
-; CHECK-RV32-NEXT:    sltu a3, a3, a7
+; CHECK-RV32-NEXT:    sltu a3, a7, a3
 ; CHECK-RV32-NEXT:    addi a3, a3, -1
-; CHECK-RV32-NEXT:    and a3, a3, a7
+; CHECK-RV32-NEXT:    and a3, a3, t2
 ; CHECK-RV32-NEXT:    bltu a3, a2, .LBB57_6
 ; CHECK-RV32-NEXT:  # %bb.5:
 ; CHECK-RV32-NEXT:    mv a3, a2
@@ -751,7 +751,7 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV64-NEXT:    mv a6, a7
 ; CHECK-RV64-NEXT:  .LBB57_2:
 ; CHECK-RV64-NEXT:    sub a5, a6, a4
-; CHECK-RV64-NEXT:    sltu t0, a6, a5
+; CHECK-RV64-NEXT:    sltu t0, a4, a6
 ; CHECK-RV64-NEXT:    addi t0, t0, -1
 ; CHECK-RV64-NEXT:    and t0, t0, a5
 ; CHECK-RV64-NEXT:    mv a5, a6
@@ -761,15 +761,15 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV64-NEXT:  .LBB57_4:
 ; CHECK-RV64-NEXT:    mul t1, a5, a1
 ; CHECK-RV64-NEXT:    srli t2, a4, 3
-; CHECK-RV64-NEXT:    sub a7, a2, a7
 ; CHECK-RV64-NEXT:    vsetvli t3, zero, e8, mf4, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vx v0, v8, t2
+; CHECK-RV64-NEXT:    sub t2, a2, a7
 ; CHECK-RV64-NEXT:    add t1, a0, t1
 ; CHECK-RV64-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vlse64.v v16, (t1), a1, v0.t
-; CHECK-RV64-NEXT:    sltu a2, a2, a7
+; CHECK-RV64-NEXT:    sltu a2, a7, a2
 ; CHECK-RV64-NEXT:    addi a2, a2, -1
-; CHECK-RV64-NEXT:    and a2, a2, a7
+; CHECK-RV64-NEXT:    and a2, a2, t2
 ; CHECK-RV64-NEXT:    bltu a2, a4, .LBB57_6
 ; CHECK-RV64-NEXT:  # %bb.5:
 ; CHECK-RV64-NEXT:    mv a2, a4
@@ -861,10 +861,10 @@ define <vscale x 16 x i64> @zero_strided_vadd_nxv16i64(<vscale x 16 x i64> %v, p
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
 ; CHECK-RV32-NEXT:    srli a2, a1, 3
-; CHECK-RV32-NEXT:    sub a1, a2, a1
-; CHECK-RV32-NEXT:    sltu a3, a2, a1
-; CHECK-RV32-NEXT:    addi a3, a3, -1
-; CHECK-RV32-NEXT:    and a1, a3, a1
+; CHECK-RV32-NEXT:    sub a3, a2, a1
+; CHECK-RV32-NEXT:    sltu a1, a1, a2
+; CHECK-RV32-NEXT:    addi a1, a1, -1
+; CHECK-RV32-NEXT:    and a1, a1, a3
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v24, (a0), zero
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
index 2ec89888af077..12ff5e98c00e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
@@ -492,12 +492,12 @@ define void @strided_store_nxv16f64(<vscale x 16 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v8, (a0), a1, v0.t
 ; CHECK-NEXT:    sub a5, a2, a3
+; CHECK-NEXT:    sltu a2, a3, a2
 ; CHECK-NEXT:    mul a4, a4, a1
 ; CHECK-NEXT:    srli a3, a3, 3
-; CHECK-NEXT:    sltu a2, a2, a5
+; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a5
 ; CHECK-NEXT:    add a0, a0, a4
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -508,25 +508,45 @@ define void @strided_store_nxv16f64(<vscale x 16 x double> %v, ptr %ptr, i32 sig
 }
 
 define void @strided_store_nxv16f64_allones_mask(<vscale x 16 x double> %v, ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
-; CHECK-LABEL: strided_store_nxv16f64_allones_mask:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    bltu a2, a4, .LBB47_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a3, a4
-; CHECK-NEXT:  .LBB47_2:
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT:    vsse64.v v8, (a0), a1
-; CHECK-NEXT:    sub a4, a2, a4
-; CHECK-NEXT:    mul a3, a3, a1
-; CHECK-NEXT:    sltu a2, a2, a4
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a4
-; CHECK-NEXT:    add a0, a0, a3
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vsse64.v v16, (a0), a1
-; CHECK-NEXT:    ret
+; CHECK-RV32-LABEL: strided_store_nxv16f64_allones_mask:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    csrr a3, vlenb
+; CHECK-RV32-NEXT:    mv a4, a2
+; CHECK-RV32-NEXT:    bltu a2, a3, .LBB47_2
+; CHECK-RV32-NEXT:  # %bb.1:
+; CHECK-RV32-NEXT:    mv a4, a3
+; CHECK-RV32-NEXT:  .LBB47_2:
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-RV32-NEXT:    sub a5, a2, a3
+; CHECK-RV32-NEXT:    sltu a2, a3, a2
+; CHECK-RV32-NEXT:    mul a3, a4, a1
+; CHECK-RV32-NEXT:    addi a2, a2, -1
+; CHECK-RV32-NEXT:    and a2, a2, a5
+; CHECK-RV32-NEXT:    add a0, a0, a3
+; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vsse64.v v16, (a0), a1
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: strided_store_nxv16f64_allones_mask:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    csrr a4, vlenb
+; CHECK-RV64-NEXT:    mv a3, a2
+; CHECK-RV64-NEXT:    bltu a2, a4, .LBB47_2
+; CHECK-RV64-NEXT:  # %bb.1:
+; CHECK-RV64-NEXT:    mv a3, a4
+; CHECK-RV64-NEXT:  .LBB47_2:
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-RV64-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-RV64-NEXT:    sub a5, a2, a4
+; CHECK-RV64-NEXT:    sltu a2, a4, a2
+; CHECK-RV64-NEXT:    mul a3, a3, a1
+; CHECK-RV64-NEXT:    addi a2, a2, -1
+; CHECK-RV64-NEXT:    and a2, a2, a5
+; CHECK-RV64-NEXT:    add a0, a0, a3
+; CHECK-RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV64-NEXT:    vsse64.v v16, (a0), a1
+; CHECK-RV64-NEXT:    ret
   call void @llvm.experimental.vp.strided.store.nxv16f64.p0.i32(<vscale x 16 x double> %v, ptr %ptr, i32 %stride, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret void
 }
@@ -554,19 +574,19 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v8, (a1), a2, v0.t
 ; CHECK-NEXT:    sub a0, a5, a4
-; CHECK-NEXT:    mul a7, a7, a2
-; CHECK-NEXT:    srli t0, a4, 3
-; CHECK-NEXT:    sub a6, a3, a6
+; CHECK-NEXT:    sub t0, a3, a6
+; CHECK-NEXT:    sltu a3, a6, a3
+; CHECK-NEXT:    srli a6, a4, 3
 ; CHECK-NEXT:    vsetvli t1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v7, t0
-; CHECK-NEXT:    sltu t0, a5, a0
+; CHECK-NEXT:    vslidedown.vx v0, v7, a6
+; CHECK-NEXT:    sltu a6, a4, a5
+; CHECK-NEXT:    mul a7, a7, a2
+; CHECK-NEXT:    addi a6, a6, -1
 ; CHECK-NEXT:    add a7, a1, a7
-; CHECK-NEXT:    sltu a3, a3, a6
-; CHECK-NEXT:    addi t0, t0, -1
 ; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and t0, t0, a0
-; CHECK-NEXT:    and a0, a3, a6
-; CHECK-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
+; CHECK-NEXT:    and a6, a6, a0
+; CHECK-NEXT:    and a0, a3, t0
+; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v16, (a7), a2, v0.t
 ; CHECK-NEXT:    bltu a0, a4, .LBB48_6
 ; CHECK-NEXT:  # %bb.5:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
index c64b755051898..6378135654ed1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
@@ -551,7 +551,7 @@ define <vscale x 128 x i8> @vadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub a2, a1, a0
-; CHECK-NEXT:    sltu a3, a1, a2
+; CHECK-NEXT:    sltu a3, a0, a1
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
@@ -574,7 +574,7 @@ define <vscale x 128 x i8> @vadd_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va, i
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
@@ -1306,7 +1306,7 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -1329,7 +1329,7 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va, i
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -1354,11 +1354,11 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, <v
 ; RV32-NEXT:    srli a1, a0, 2
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV32-NEXT:    vadd.vi v8, v8, -1, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT:    slli a2, a0, 1
+; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a1
-; RV32-NEXT:    slli a1, a0, 1
-; RV32-NEXT:    sub a1, a0, a1
-; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    sub a1, a0, a2
+; RV32-NEXT:    sltu a0, a2, a0
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a1
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -1374,7 +1374,7 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, <v
 ; RV64-NEXT:    slli a1, a0, 1
 ; RV64-NEXT:    vslidedown.vx v0, v0, a2
 ; RV64-NEXT:    sub a2, a0, a1
-; RV64-NEXT:    sltu a3, a0, a2
+; RV64-NEXT:    sltu a3, a1, a0
 ; RV64-NEXT:    addi a3, a3, -1
 ; RV64-NEXT:    and a2, a3, a2
 ; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
index e0fcd4009ad2e..7d97f353a22b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
@@ -847,7 +847,7 @@ define <vscale x 16 x double> @vfabs_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -869,7 +869,7 @@ define <vscale x 16 x double> @vfabs_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index d03b068e11ea8..42b1da9d97f2c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -928,13 +928,13 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; ZVFH-NEXT:    slli a1, a2, 1
 ; ZVFH-NEXT:    srli a2, a2, 2
 ; ZVFH-NEXT:    sub a3, a0, a1
+; ZVFH-NEXT:    sltu a4, a1, a0
 ; ZVFH-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFH-NEXT:    sltu a2, a0, a3
-; ZVFH-NEXT:    addi a2, a2, -1
-; ZVFH-NEXT:    and a2, a2, a3
-; ZVFH-NEXT:    addi a3, sp, 16
-; ZVFH-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT:    addi a4, a4, -1
+; ZVFH-NEXT:    and a3, a4, a3
+; ZVFH-NEXT:    addi a2, sp, 16
+; ZVFH-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -977,13 +977,13 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1026,13 +1026,13 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
+; ZVFBFA-NEXT:    sltu a4, a1, a0
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    addi a3, sp, 16
-; ZVFBFA-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    addi a2, sp, 16
+; ZVFBFA-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16alt, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1079,14 +1079,14 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; ZVFH-NEXT:    slli a1, a2, 1
 ; ZVFH-NEXT:    srli a2, a2, 2
 ; ZVFH-NEXT:    sub a3, a0, a1
-; ZVFH-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFH-NEXT:    sltu a4, a1, a0
+; ZVFH-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFH-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFH-NEXT:    sltu a2, a0, a3
-; ZVFH-NEXT:    addi a2, a2, -1
-; ZVFH-NEXT:    and a2, a2, a3
-; ZVFH-NEXT:    addi a3, sp, 16
-; ZVFH-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT:    addi a4, a4, -1
+; ZVFH-NEXT:    and a3, a4, a3
+; ZVFH-NEXT:    addi a2, sp, 16
+; ZVFH-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1128,14 +1128,14 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1177,14 +1177,14 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
-; ZVFBFA-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT:    sltu a4, a1, a0
+; ZVFBFA-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    addi a3, sp, 16
-; ZVFBFA-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    addi a2, sp, 16
+; ZVFBFA-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16alt, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1216,130 +1216,6 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 }
 
 define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfadd_vf_nxv32bf16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    addi sp, sp, -16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 16
-; ZVFH-NEXT:    csrr a1, vlenb
-; ZVFH-NEXT:    slli a1, a1, 4
-; ZVFH-NEXT:    sub sp, sp, a1
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFH-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFH-NEXT:    vmv1r.v v7, v0
-; ZVFH-NEXT:    fmv.x.h a1, fa0
-; ZVFH-NEXT:    csrr a2, vlenb
-; ZVFH-NEXT:    vmv.v.x v24, a1
-; ZVFH-NEXT:    slli a1, a2, 1
-; ZVFH-NEXT:    srli a2, a2, 2
-; ZVFH-NEXT:    sub a3, a0, a1
-; ZVFH-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFH-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFH-NEXT:    sltu a2, a0, a3
-; ZVFH-NEXT:    addi a2, a2, -1
-; ZVFH-NEXT:    and a2, a2, a3
-; ZVFH-NEXT:    csrr a3, vlenb
-; ZVFH-NEXT:    slli a3, a3, 3
-; ZVFH-NEXT:    add a3, sp, a3
-; ZVFH-NEXT:    addi a3, a3, 16
-; ZVFH-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v28, v0.t
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT:    vfadd.vv v16, v24, v16, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFH-NEXT:    bltu a0, a1, .LBB24_2
-; ZVFH-NEXT:  # %bb.1:
-; ZVFH-NEXT:    mv a0, a1
-; ZVFH-NEXT:  .LBB24_2:
-; ZVFH-NEXT:    vmv1r.v v0, v7
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
-; ZVFH-NEXT:    add a0, sp, a0
-; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT:    vfadd.vv v16, v16, v24, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 4
-; ZVFH-NEXT:    add sp, sp, a0
-; ZVFH-NEXT:    .cfi_def_cfa sp, 16
-; ZVFH-NEXT:    addi sp, sp, 16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 0
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB24_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB24_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-;
 ; ZVFBFA-LABEL: vfadd_vf_nxv32bf16:
 ; ZVFBFA:       # %bb.0:
 ; ZVFBFA-NEXT:    addi sp, sp, -16
@@ -1355,14 +1231,14 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
-; ZVFBFA-NEXT:    vsetvli a4, zero, e8alt, mf2, ta, ma
+; ZVFBFA-NEXT:    sltu a4, a1, a0
+; ZVFBFA-NEXT:    vsetvli a5, zero, e8alt, mf2, ta, ma
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    addi a3, sp, 16
-; ZVFBFA-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    addi a2, sp, 16
+; ZVFBFA-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16alt, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1397,108 +1273,6 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 }
 
 define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; ZVFH-LABEL: vfadd_vf_nxv32bf16_unmasked:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    addi sp, sp, -16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 16
-; ZVFH-NEXT:    csrr a1, vlenb
-; ZVFH-NEXT:    slli a1, a1, 3
-; ZVFH-NEXT:    sub sp, sp, a1
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFH-NEXT:    fmv.x.h a1, fa0
-; ZVFH-NEXT:    csrr a2, vlenb
-; ZVFH-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFH-NEXT:    vmset.m v24
-; ZVFH-NEXT:    vmv.v.x v16, a1
-; ZVFH-NEXT:    slli a1, a2, 1
-; ZVFH-NEXT:    srli a2, a2, 2
-; ZVFH-NEXT:    sub a3, a0, a1
-; ZVFH-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFH-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFH-NEXT:    sltu a2, a0, a3
-; ZVFH-NEXT:    addi a2, a2, -1
-; ZVFH-NEXT:    and a2, a2, a3
-; ZVFH-NEXT:    addi a3, sp, 16
-; ZVFH-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT:    vfadd.vv v16, v16, v24, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFH-NEXT:    bltu a0, a1, .LBB25_2
-; ZVFH-NEXT:  # %bb.1:
-; ZVFH-NEXT:    mv a0, a1
-; ZVFH-NEXT:  .LBB25_2:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT:    vfadd.vv v16, v16, v24
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvtbf16.f.f.w v8, v16
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
-; ZVFH-NEXT:    add sp, sp, a0
-; ZVFH-NEXT:    .cfi_def_cfa sp, 16
-; ZVFH-NEXT:    addi sp, sp, 16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 0
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB25_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB25_2:
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-;
 ; ZVFBFA-LABEL: vfadd_vf_nxv32bf16_unmasked:
 ; ZVFBFA:       # %bb.0:
 ; ZVFBFA-NEXT:    addi sp, sp, -16
@@ -1514,14 +1288,14 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
-; ZVFBFA-NEXT:    vsetvli a4, zero, e8alt, mf2, ta, ma
+; ZVFBFA-NEXT:    sltu a4, a1, a0
+; ZVFBFA-NEXT:    vsetvli a5, zero, e8alt, mf2, ta, ma
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    addi a3, sp, 16
-; ZVFBFA-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    addi a2, sp, 16
+; ZVFBFA-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16alt, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -2351,13 +2125,13 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -2400,13 +2174,13 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
+; ZVFBFA-NEXT:    sltu a4, a1, a0
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    addi a3, sp, 16
-; ZVFBFA-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    addi a2, sp, 16
+; ZVFBFA-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -2459,14 +2233,14 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -2508,14 +2282,14 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
-; ZVFBFA-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT:    sltu a4, a1, a0
+; ZVFBFA-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    addi a3, sp, 16
-; ZVFBFA-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    addi a2, sp, 16
+; ZVFBFA-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -2553,68 +2327,6 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFH-NEXT:    vfadd.vf v8, v8, fa0, v0.t
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB50_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB50_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-;
 ; ZVFBFA-LABEL: vfadd_vf_nxv32f16:
 ; ZVFBFA:       # %bb.0:
 ; ZVFBFA-NEXT:    addi sp, sp, -16
@@ -2631,17 +2343,17 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
-; ZVFBFA-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT:    sltu a4, a1, a0
+; ZVFBFA-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    csrr a3, vlenb
-; ZVFBFA-NEXT:    slli a3, a3, 3
-; ZVFBFA-NEXT:    add a3, sp, a3
-; ZVFBFA-NEXT:    addi a3, a3, 16
-; ZVFBFA-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    csrr a2, vlenb
+; ZVFBFA-NEXT:    slli a2, a2, 3
+; ZVFBFA-NEXT:    add a2, sp, a2
+; ZVFBFA-NEXT:    addi a2, a2, 16
+; ZVFBFA-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -2689,57 +2401,6 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFH-NEXT:    vfadd.vf v8, v8, fa0
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB51_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB51_2:
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-;
 ; ZVFBFA-LABEL: vfadd_vf_nxv32f16_unmasked:
 ; ZVFBFA:       # %bb.0:
 ; ZVFBFA-NEXT:    addi sp, sp, -16
@@ -2756,14 +2417,14 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
-; ZVFBFA-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT:    sltu a4, a1, a0
+; ZVFBFA-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    addi a3, sp, 16
-; ZVFBFA-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    addi a2, sp, 16
+; ZVFBFA-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
index e9d7137919ac9..5f8603067d82a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
@@ -367,13 +367,13 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -420,14 +420,14 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -459,67 +459,6 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 }
 
 define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfdiv_vf_nxv32bf16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vmv.v.x v24, a1
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB22_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB22_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x bfloat> @llvm.vp.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -527,56 +466,6 @@ define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 }
 
 define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfdiv_vf_nxv32bf16_unmasked:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmset.m v24
-; CHECK-NEXT:    vmv.v.x v16, a1
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB23_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB23_2:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v16, v24
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x bfloat> @llvm.vp.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -1064,13 +953,13 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1123,14 +1012,14 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1167,68 +1056,6 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfdiv.vf v8, v8, fa0, v0.t
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB46_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB46_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x half> @llvm.vp.fdiv.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -1241,57 +1068,6 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfdiv.vf v8, v8, fa0
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB47_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB47_2:
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x half> @llvm.vp.fdiv.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
index c25a0d47c5c53..03cbe8c5d555c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
@@ -600,16 +600,16 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 ; CHECK-NEXT:    slli a0, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a1, a0
+; CHECK-NEXT:    sltu a4, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a1, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -716,17 +716,17 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    slli a0, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a1, a0
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a0, a1
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    sltu a2, a1, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -815,7 +815,7 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfl
 ; CHECK-NEXT:    srli a3, a3, 2
 ; CHECK-NEXT:    sub a4, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    sltu a3, a0, a4
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    csrr a4, vlenb
@@ -912,124 +912,6 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfl
 }
 
 define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_commute(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfma_vf_nxv32bf16_commute:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v3, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vmv.v.x v8, a1
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v4, v8, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB33_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB33_2:
-; CHECK-NEXT:    vmv1r.v v0, v3
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; CHECK-NEXT:    vmv4r.v v12, v4
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> %m, i32 %evl)
@@ -1058,7 +940,7 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    sub a4, a0, a1
 ; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a3
-; CHECK-NEXT:    sltu a3, a0, a4
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    csrr a4, vlenb
@@ -1161,107 +1043,6 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 }
 
 define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked_commute(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x bfloat> %vc, i32 zeroext %evl) {
-; CHECK-LABEL: vfma_vf_nxv32bf16_unmasked_commute:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmset.m v8
-; CHECK-NEXT:    vmv.v.x v24, a1
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28, v0.t
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v20, v8, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB35_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB35_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v0, v24, v8
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v16, v0
-; CHECK-NEXT:    vmv8r.v v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -2049,16 +1830,16 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a0, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a1, a0
+; ZVFHMIN-NEXT:    sltu a4, a0, a1
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a1, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -2172,17 +1953,17 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    slli a0, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a1, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -2277,7 +2058,7 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -2378,153 +2159,34 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_commute(<vscale x 32 x half> %va,
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfmadd.vf v8, fa0, v16, v0.t
+; ZVFH-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfma_vf_nxv32f16_unmasked:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT:    vfmadd.vf v8, fa0, v16
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfma_vf_nxv32f16_commute:
+; ZVFHMIN-LABEL: vfma_vf_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v8, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB69_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB69_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    vmv4r.v v12, v4
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
-  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfma_vf_nxv32f16_unmasked:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfmadd.vf v8, fa0, v16
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfma_vf_nxv32f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v24, v8
-; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vmset.m v8
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
@@ -2532,7 +2194,7 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -2640,108 +2302,6 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked_commute(<vscale x 32 x ha
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfmadd.vf v8, fa0, v16
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfma_vf_nxv32f16_unmasked_commute:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v8
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB71_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB71_2:
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v0
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -3428,14 +2988,14 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    slli a5, a1, 3
 ; CHECK-NEXT:    sub a6, a4, a1
 ; CHECK-NEXT:    add a7, a2, a5
-; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    vl8re64.v v8, (a7)
 ; CHECK-NEXT:    csrr a7, vlenb
 ; CHECK-NEXT:    slli a7, a7, 3
 ; CHECK-NEXT:    add a7, sp, a7
 ; CHECK-NEXT:    addi a7, a7, 16
 ; CHECK-NEXT:    vs8r.v v8, (a7) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    sltu a7, a4, a6
+; CHECK-NEXT:    sltu a7, a1, a4
+; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    addi a7, a7, -1
 ; CHECK-NEXT:    vl8re64.v v8, (a5)
 ; CHECK-NEXT:    csrr a5, vlenb
@@ -3563,7 +3123,7 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    sub a5, a4, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v24, (a3)
-; CHECK-NEXT:    sltu a3, a4, a5
+; CHECK-NEXT:    sltu a3, a1, a4
 ; CHECK-NEXT:    vl8re64.v v8, (a2)
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
@@ -7976,35 +7536,36 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
+; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2, v0.t
+; ZVFHMIN-NEXT:    sltu a3, a0, a1
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a2, v0.t
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
+; ZVFHMIN-NEXT:    vmv4r.v v8, v16
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -8013,37 +7574,37 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v24, v0.t
 ; ZVFHMIN-NEXT:    bltu a1, a0, .LBB280_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:  .LBB280_2:
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
@@ -8052,17 +7613,17 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vmv.v.v v16, v8
 ; ZVFHMIN-NEXT:    vmv4r.v v12, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
@@ -8114,10 +7675,10 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    sub a4, a1, a0
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
+; ZVFHMIN-NEXT:    sltu a3, a0, a1
+; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    vmv4r.v v8, v16
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
@@ -8229,7 +7790,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -8338,128 +7899,6 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_commute(<vscale x 32 x half> %va
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfmsub.vf v8, fa0, v16, v0.t
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfmsub_vf_nxv32f16_commute:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT:    slli a1, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v8, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB283_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB283_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    vmv4r.v v12, v4
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
@@ -8498,7 +7937,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -8606,9 +8045,75 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfmsub.vf v8, fa0, v16
+; ZVFH-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v16, v8, v24, v0.t
+; ZVFH-NEXT:    vmv.v.v v8, v16
+; ZVFH-NEXT:    ret
+  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16_commuted:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24, v0.t
+; ZVFH-NEXT:    ret
+  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT:    ret
+  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT:    ret
+  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16, v0.t
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfmsub_vf_nxv32f16_unmasked_commute:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
@@ -8616,30 +8121,31 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v3, v0
+; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    mv a4, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, a1, a4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
@@ -8654,7 +8160,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -8662,27 +8168,39 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB285_2
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB290_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB285_2:
+; ZVFHMIN-NEXT:  .LBB290_2:
+; ZVFHMIN-NEXT:    vmv1r.v v0, v3
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a1, a0
@@ -8690,1046 +8208,21 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v0
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
-  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v16, v8, v24, v0.t
-; ZVFH-NEXT:    vmv.v.v v8, v16
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a2, v0.t
-; ZVFHMIN-NEXT:    slli a0, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    slli a5, a5, 4
-; ZVFHMIN-NEXT:    add a5, sp, a5
-; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2, v0.t
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v24, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB286_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB286_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vmv4r.v v12, v4
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
-  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16_commuted:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24, v0.t
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_commuted:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT:    slli a0, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB287_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB287_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v16, v8
-; ZVFHMIN-NEXT:    vmv4r.v v12, v4
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
-  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT:    slli a0, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB288_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB288_2:
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT:    slli a0, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB289_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB289_2:
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16, v0.t
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a4, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT:    slli a1, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB290_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB290_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v16, v8
-; ZVFHMIN-NEXT:    vmv4r.v v12, v4
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
-  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
-  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_commute:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16, v0.t
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_commute:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a4, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT:    slli a1, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB291_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB291_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v16, v8
-; ZVFHMIN-NEXT:    vmv4r.v v12, v4
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
-  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
-  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v24, v8, a1
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a1
-; ZVFHMIN-NEXT:    slli a1, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    mv a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
-; ZVFHMIN-NEXT:    add a4, a4, a5
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 5
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB292_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB292_2:
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v0, v16
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
-  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
-; ZVFHMIN-NEXT:    slli a1, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    mv a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
-; ZVFHMIN-NEXT:    add a4, a4, a5
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB293_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB293_2:
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vmv.v.v v16, v8
+; ZVFHMIN-NEXT:    vmv4r.v v12, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -9739,20 +8232,20 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
+  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
 }
 
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_commute:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16, v0.t
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
@@ -9760,30 +8253,31 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v3, v0
+; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    mv a4, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, a1, a4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
@@ -9814,10 +8308,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB294_2
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB291_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB294_2:
+; ZVFHMIN-NEXT:  .LBB291_2:
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
@@ -9870,249 +8364,134 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
-  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16, v0.t
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    slli a1, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v8, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB295_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB295_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v16, v8
-; ZVFHMIN-NEXT:    vmv4r.v v12, v4
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
-  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
 }
 
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v7
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT:    vxor.vx v24, v8, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v16, a1
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sub a4, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v8, a2
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB296_2
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB292_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB296_2:
+; ZVFHMIN-NEXT:  .LBB292_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v24
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v0, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -10120,20 +8499,20 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
 }
 
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
@@ -10141,80 +8520,74 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    sub a4, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a3
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v8, a2
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB297_2
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB293_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB297_2:
+; ZVFHMIN-NEXT:  .LBB293_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a1, a0
@@ -10222,15 +8595,20 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v0
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -10240,79 +8618,66 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
 }
 
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v16, v8, v24, v0.t
-; ZVFH-NEXT:    vmv.v.v v8, v16
+; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16, v0.t
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a2, v0.t
-; ZVFHMIN-NEXT:    slli a0, a3, 1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
+; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
+; ZVFHMIN-NEXT:    sub a2, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    slli a5, a5, 4
-; ZVFHMIN-NEXT:    add a5, sp, a5
-; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2, v0.t
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -10320,30 +8685,39 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v24, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB298_2
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB294_2
 ; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB298_2:
+; ZVFHMIN-NEXT:    mv a0, a1
+; ZVFHMIN-NEXT:  .LBB294_2:
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
@@ -10353,27 +8727,19 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vmv.v.v v16, v8
 ; ZVFHMIN-NEXT:    vmv4r.v v12, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -10381,68 +8747,68 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
-  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
-  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
 }
 
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16_commuted:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24, v0.t
+; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16, v0.t
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_commuted:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT:    slli a0, a3, 1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
+; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
+; ZVFHMIN-NEXT:    sub a2, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -10450,38 +8816,38 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB299_2
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v8, v0.t
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB295_2
 ; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB299_2:
+; ZVFHMIN-NEXT:    mv a0, a1
+; ZVFHMIN-NEXT:  .LBB295_2:
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
@@ -10490,15 +8856,15 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vmv.v.v v16, v8
@@ -10512,68 +8878,69 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
-  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
-  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
 }
 
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_unmasked:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT:    slli a0, a3, 1
+; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    sub a2, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -10589,16 +8956,16 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB300_2
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB296_2
 ; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB300_2:
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    mv a0, a1
+; ZVFHMIN-NEXT:  .LBB296_2:
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
@@ -10607,7 +8974,7 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a1, a0
@@ -10615,14 +8982,14 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -10630,68 +8997,69 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
-  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
 }
 
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT:    slli a0, a3, 1
+; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    sub a2, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -10699,33 +9067,33 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB301_2
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8, v0.t
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB297_2
 ; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB301_2:
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    mv a0, a1
+; ZVFHMIN-NEXT:  .LBB297_2:
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a1, a0
@@ -10733,14 +9101,15 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v0
+; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -10748,6 +9117,61 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v16, v8, v24, v0.t
+; ZVFH-NEXT:    vmv.v.v v8, v16
+; ZVFH-NEXT:    ret
+  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16_commuted:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24, v0.t
+; ZVFH-NEXT:    ret
+  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT:    ret
+  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT:    ret
   %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -10781,7 +9205,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -10912,7 +9336,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -11038,7 +9462,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -11168,7 +9592,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -11296,11 +9720,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    vxor.vx v8, v16, a2, v0.t
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
@@ -11425,11 +9849,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    vxor.vx v8, v16, a2, v0.t
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
@@ -11560,11 +9984,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
 ; ZVFHMIN-NEXT:    vmv4r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
@@ -11679,11 +10103,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
 ; ZVFHMIN-NEXT:    vmv4r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
index 394887fee67fc..803680dd09061 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
@@ -177,13 +177,13 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -230,14 +230,14 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -499,13 +499,13 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -558,14 +558,14 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
index 5c5542619b6ef..43b62bb7f9f76 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
@@ -177,13 +177,13 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -230,14 +230,14 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -499,13 +499,13 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -558,14 +558,14 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
index eb77b4b4dbac3..39f0163de048c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
@@ -489,13 +489,13 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -548,14 +548,14 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -592,68 +592,6 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfmul.vf v8, v8, fa0, v0.t
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfmul_vf_nxv32f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x half> @llvm.vp.fmul.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -666,57 +604,6 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfmul.vf v8, v8, fa0
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfmul_vf_nxv32f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x half> @llvm.vp.fmul.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
index 03de2c97e685c..37ee3ad000854 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
@@ -1096,14 +1096,14 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    slli a5, a1, 3
 ; CHECK-NEXT:    sub a6, a4, a1
 ; CHECK-NEXT:    add a7, a2, a5
-; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    vl8re64.v v8, (a7)
 ; CHECK-NEXT:    csrr a7, vlenb
 ; CHECK-NEXT:    slli a7, a7, 3
 ; CHECK-NEXT:    add a7, sp, a7
 ; CHECK-NEXT:    addi a7, a7, 16
 ; CHECK-NEXT:    vs8r.v v8, (a7) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    sltu a7, a4, a6
+; CHECK-NEXT:    sltu a7, a1, a4
+; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    addi a7, a7, -1
 ; CHECK-NEXT:    vl8re64.v v8, (a5)
 ; CHECK-NEXT:    csrr a5, vlenb
@@ -1217,7 +1217,7 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    sub a5, a4, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v24, (a3)
-; CHECK-NEXT:    sltu a3, a4, a5
+; CHECK-NEXT:    sltu a3, a1, a4
 ; CHECK-NEXT:    vl8re64.v v8, (a2)
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
index 96fbe3f6ff025..a78fea1ef3110 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
@@ -799,7 +799,7 @@ define <vscale x 16 x double> @vfneg_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -821,7 +821,7 @@ define <vscale x 16 x double> @vfneg_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
index 458795db7965d..c759f2b48f53f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
@@ -93,7 +93,7 @@ define <vscale x 32 x float> @vfpext_nxv32f16_nxv32f32(<vscale x 32 x half> %a,
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
index 7127d10e67dbc..5a0e0e8004af8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
@@ -469,7 +469,7 @@ define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -497,7 +497,7 @@ define <vscale x 32 x i32> @vfptosi_nxv32i32_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -520,7 +520,7 @@ define <vscale x 32 x i32> @vfptosi_nxv32i32_nxv32f32_unmasked(<vscale x 32 x fl
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
index 07b58ed057508..03c5f7eed3fc0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
@@ -469,7 +469,7 @@ define <vscale x 32 x i16> @vfptoui_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -497,7 +497,7 @@ define <vscale x 32 x i32> @vfptoui_nxv32i32_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -520,7 +520,7 @@ define <vscale x 32 x i32> @vfptoui_nxv32i32_nxv32f32_unmasked(<vscale x 32 x fl
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
index 4177672b3a306..0f78e035e39d0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
@@ -92,7 +92,7 @@ define <vscale x 16 x float> @vfptrunc_nxv16f32_nxv16f64(<vscale x 16 x double>
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -135,11 +135,11 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    slli a3, a1, 1
 ; CHECK-NEXT:    add a6, a0, a4
 ; CHECK-NEXT:    sub a0, a2, a3
-; CHECK-NEXT:    sltu a4, a2, a0
+; CHECK-NEXT:    sltu a4, a3, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a0, a4, a0
 ; CHECK-NEXT:    sub a4, a0, a1
-; CHECK-NEXT:    sltu a7, a0, a4
+; CHECK-NEXT:    sltu a7, a1, a0
 ; CHECK-NEXT:    addi a7, a7, -1
 ; CHECK-NEXT:    and a4, a7, a4
 ; CHECK-NEXT:    srli a7, a1, 2
@@ -162,7 +162,7 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    mv a2, a3
 ; CHECK-NEXT:  .LBB8_4:
 ; CHECK-NEXT:    sub a0, a2, a1
-; CHECK-NEXT:    sltu a3, a2, a0
+; CHECK-NEXT:    sltu a3, a1, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a0, a3, a0
 ; CHECK-NEXT:    vmv1r.v v0, v6
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
index 451b13edb794e..a77b8a6905f71 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
@@ -161,7 +161,7 @@ define <vscale x 32 x bfloat> @vfsqrt_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    and a3, a4, a3
@@ -196,7 +196,7 @@ define <vscale x 32 x bfloat> @vfsqrt_vv_nxv32bf16_unmasked(<vscale x 32 x bfloa
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v16, a2
@@ -437,7 +437,7 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    and a3, a4, a3
@@ -478,7 +478,7 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v16, a2
@@ -715,7 +715,7 @@ define <vscale x 16 x double> @vfsqrt_vv_nxv16f64(<vscale x 16 x double> %va, <v
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -737,7 +737,7 @@ define <vscale x 16 x double> @vfsqrt_vv_nxv16f64_unmasked(<vscale x 16 x double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
index 6637aced3cdac..ce30d9257cb02 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
@@ -367,13 +367,13 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -420,14 +420,14 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -459,67 +459,6 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 }
 
 define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfsub_vf_nxv32bf16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vmv.v.x v24, a1
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB22_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB22_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x bfloat> @llvm.vp.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -527,56 +466,6 @@ define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 }
 
 define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfsub_vf_nxv32bf16_unmasked:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmset.m v24
-; CHECK-NEXT:    vmv.v.x v16, a1
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB23_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB23_2:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v16, v24
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x bfloat> @llvm.vp.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -1064,13 +953,13 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1123,14 +1012,14 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1167,68 +1056,6 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfsub.vf v8, v8, fa0, v0.t
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfsub_vf_nxv32f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB46_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB46_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x half> @llvm.vp.fsub.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -1241,57 +1068,6 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfsub.vf v8, v8, fa0
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfsub_vf_nxv32f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB47_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB47_2:
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x half> @llvm.vp.fsub.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
index 3cf464247250a..df4b731015243 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
@@ -396,7 +396,7 @@ define <vscale x 128 x i8> @vmax_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vs
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -421,7 +421,7 @@ define <vscale x 128 x i8> @vmax_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va, i
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -935,7 +935,7 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <v
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -960,7 +960,7 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va, i
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -987,11 +987,11 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
 ; RV32-NEXT:    srli a2, a1, 2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmax.vx v8, v8, a0, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT:    slli a3, a1, 1
+; RV32-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    slli a2, a1, 1
-; RV32-NEXT:    sub a2, a1, a2
-; RV32-NEXT:    sltu a1, a1, a2
+; RV32-NEXT:    sub a2, a1, a3
+; RV32-NEXT:    sltu a1, a3, a1
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -1007,7 +1007,7 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
 ; RV64-NEXT:    slli a2, a1, 1
 ; RV64-NEXT:    vslidedown.vx v0, v0, a3
 ; RV64-NEXT:    sub a3, a1, a2
-; RV64-NEXT:    sltu a4, a1, a3
+; RV64-NEXT:    sltu a4, a2, a1
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
index e755d099df4a8..9b5e83f94e5fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
@@ -395,7 +395,7 @@ define <vscale x 128 x i8> @vmaxu_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <v
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -420,7 +420,7 @@ define <vscale x 128 x i8> @vmaxu_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va,
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -934,7 +934,7 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -959,7 +959,7 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -986,11 +986,11 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
 ; RV32-NEXT:    srli a2, a1, 2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmaxu.vx v8, v8, a0, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT:    slli a3, a1, 1
+; RV32-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    slli a2, a1, 1
-; RV32-NEXT:    sub a2, a1, a2
-; RV32-NEXT:    sltu a1, a1, a2
+; RV32-NEXT:    sub a2, a1, a3
+; RV32-NEXT:    sltu a1, a3, a1
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -1006,7 +1006,7 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
 ; RV64-NEXT:    slli a2, a1, 1
 ; RV64-NEXT:    vslidedown.vx v0, v0, a3
 ; RV64-NEXT:    sub a3, a1, a2
-; RV64-NEXT:    sltu a4, a1, a3
+; RV64-NEXT:    sltu a4, a2, a1
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
index 961f63cbfbc95..1816b07c49c6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
@@ -396,7 +396,7 @@ define <vscale x 128 x i8> @vmin_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vs
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -421,7 +421,7 @@ define <vscale x 128 x i8> @vmin_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va, i
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -935,7 +935,7 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <v
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -960,7 +960,7 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va, i
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -987,11 +987,11 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
 ; RV32-NEXT:    srli a2, a1, 2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmin.vx v8, v8, a0, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT:    slli a3, a1, 1
+; RV32-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    slli a2, a1, 1
-; RV32-NEXT:    sub a2, a1, a2
-; RV32-NEXT:    sltu a1, a1, a2
+; RV32-NEXT:    sub a2, a1, a3
+; RV32-NEXT:    sltu a1, a3, a1
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -1007,7 +1007,7 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
 ; RV64-NEXT:    slli a2, a1, 1
 ; RV64-NEXT:    vslidedown.vx v0, v0, a3
 ; RV64-NEXT:    sub a3, a1, a2
-; RV64-NEXT:    sltu a4, a1, a3
+; RV64-NEXT:    sltu a4, a2, a1
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
index 631799d24e14c..608790009bdb5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
@@ -395,7 +395,7 @@ define <vscale x 128 x i8> @vminu_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <v
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -420,7 +420,7 @@ define <vscale x 128 x i8> @vminu_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va,
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -934,7 +934,7 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -959,7 +959,7 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -986,11 +986,11 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
 ; RV32-NEXT:    srli a2, a1, 2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vminu.vx v8, v8, a0, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT:    slli a3, a1, 1
+; RV32-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    slli a2, a1, 1
-; RV32-NEXT:    sub a2, a1, a2
-; RV32-NEXT:    sltu a1, a1, a2
+; RV32-NEXT:    sub a2, a1, a3
+; RV32-NEXT:    sltu a1, a3, a1
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -1006,7 +1006,7 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
 ; RV64-NEXT:    slli a2, a1, 1
 ; RV64-NEXT:    vslidedown.vx v0, v0, a3
 ; RV64-NEXT:    sub a3, a1, a2
-; RV64-NEXT:    sltu a4, a1, a3
+; RV64-NEXT:    sltu a4, a2, a1
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
index c96a7d774a5d5..65d37bfb31916 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
@@ -700,17 +700,17 @@ define <vscale x 128 x i8> @test_vp_reverse_nxv128i8(<vscale x 128 x i8> %src, i
 ; CHECK-NEXT:    addi a3, sp, 64
 ; CHECK-NEXT:    li a4, -1
 ; CHECK-NEXT:    sub a5, a0, a2
-; CHECK-NEXT:    add a6, a0, a3
-; CHECK-NEXT:    sltu a0, a0, a5
-; CHECK-NEXT:    add a2, a3, a2
+; CHECK-NEXT:    sltu a6, a2, a0
+; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:    addi a6, a6, -1
+; CHECK-NEXT:    add a2, a3, a2
 ; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a5, a6, a5
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vsse8.v v8, (a6), a4
-; CHECK-NEXT:    sub a6, a6, a1
-; CHECK-NEXT:    and a0, a0, a5
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsse8.v v16, (a6), a4
+; CHECK-NEXT:    vsse8.v v8, (a0), a4
+; CHECK-NEXT:    sub a0, a0, a1
+; CHECK-NEXT:    vsetvli zero, a5, e8, m8, ta, ma
+; CHECK-NEXT:    vsse8.v v16, (a0), a4
 ; CHECK-NEXT:    vle8.v v16, (a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a3)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
index b8b2ba7c5e5d3..aeee1fa8215f0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
@@ -727,7 +727,7 @@ define <vscale x 32 x i32> @vp_splat_nxv32i32(i32 %val, <vscale x 32 x i1> %m, i
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
index b83ddce61f44d..3d025a29e6725 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
@@ -470,61 +470,61 @@ define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vs
 ; CHECK-LABEL: test_vp_splice_nxv16i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a4, vlenb
+; CHECK-NEXT:    slli a5, a4, 1
+; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    slli a1, a4, 3
-; CHECK-NEXT:    slli a7, a4, 1
-; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    add a5, a0, a1
-; CHECK-NEXT:    mv a6, a2
-; CHECK-NEXT:    bltu a2, a7, .LBB22_2
+; CHECK-NEXT:    mv a7, a2
+; CHECK-NEXT:    bltu a2, a5, .LBB22_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a6, a7
+; CHECK-NEXT:    mv a7, a5
 ; CHECK-NEXT:  .LBB22_2:
 ; CHECK-NEXT:    addi sp, sp, -80
 ; CHECK-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    addi s0, sp, 80
-; CHECK-NEXT:    csrr a7, vlenb
-; CHECK-NEXT:    slli a7, a7, 5
-; CHECK-NEXT:    sub sp, sp, a7
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    slli a5, a5, 5
+; CHECK-NEXT:    sub sp, sp, a5
 ; CHECK-NEXT:    andi sp, sp, -64
-; CHECK-NEXT:    vl8re64.v v24, (a5)
-; CHECK-NEXT:    slli a5, a6, 3
+; CHECK-NEXT:    add a5, a0, a1
+; CHECK-NEXT:    slli a7, a7, 3
 ; CHECK-NEXT:    addi a6, sp, 64
-; CHECK-NEXT:    add a5, a6, a5
-; CHECK-NEXT:    mv a7, a2
+; CHECK-NEXT:    mv t0, a2
 ; CHECK-NEXT:    bltu a2, a4, .LBB22_4
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    mv a7, a4
+; CHECK-NEXT:    mv t0, a4
 ; CHECK-NEXT:  .LBB22_4:
+; CHECK-NEXT:    vl8re64.v v24, (a5)
+; CHECK-NEXT:    add a5, a6, a7
 ; CHECK-NEXT:    vl8re64.v v0, (a0)
-; CHECK-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a6)
 ; CHECK-NEXT:    sub a0, a2, a4
-; CHECK-NEXT:    add a6, a6, a1
-; CHECK-NEXT:    sub a7, a3, a4
-; CHECK-NEXT:    sltu a2, a2, a0
+; CHECK-NEXT:    sltu a2, a4, a2
 ; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a0
-; CHECK-NEXT:    sltu a0, a3, a7
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a7
-; CHECK-NEXT:    add a7, a5, a1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v16, (a6)
+; CHECK-NEXT:    and a0, a2, a0
+; CHECK-NEXT:    add a6, a6, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v24, (a7)
+; CHECK-NEXT:    vse64.v v16, (a6)
+; CHECK-NEXT:    mv a0, a3
 ; CHECK-NEXT:    bltu a3, a4, .LBB22_6
 ; CHECK-NEXT:  # %bb.5:
-; CHECK-NEXT:    mv a3, a4
+; CHECK-NEXT:    mv a0, a4
 ; CHECK-NEXT:  .LBB22_6:
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v0, (a5)
-; CHECK-NEXT:    addi a2, sp, 104
-; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v0, (a5)
+; CHECK-NEXT:    sub a2, a3, a4
+; CHECK-NEXT:    sltu a3, a4, a3
+; CHECK-NEXT:    add a5, a5, a1
+; CHECK-NEXT:    addi a4, sp, 104
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    add a1, a4, a1
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v24, (a5)
 ; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a2)
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a4)
 ; CHECK-NEXT:    addi sp, s0, -80
 ; CHECK-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -537,66 +537,66 @@ define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vs
 define <vscale x 16 x i64> @test_vp_splice_nxv16i64_negative_offset(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) #0 {
 ; CHECK-LABEL: test_vp_splice_nxv16i64_negative_offset:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a1, a4, 3
-; CHECK-NEXT:    slli a7, a4, 1
-; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    add a5, a0, a1
-; CHECK-NEXT:    mv a6, a2
-; CHECK-NEXT:    bltu a2, a7, .LBB23_2
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    slli a6, a5, 1
+; CHECK-NEXT:    addi a6, a6, -1
+; CHECK-NEXT:    slli a1, a5, 3
+; CHECK-NEXT:    mv a4, a2
+; CHECK-NEXT:    bltu a2, a6, .LBB23_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a6, a7
+; CHECK-NEXT:    mv a4, a6
 ; CHECK-NEXT:  .LBB23_2:
 ; CHECK-NEXT:    addi sp, sp, -80
 ; CHECK-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    addi s0, sp, 80
-; CHECK-NEXT:    csrr a7, vlenb
-; CHECK-NEXT:    slli a7, a7, 5
-; CHECK-NEXT:    sub sp, sp, a7
+; CHECK-NEXT:    csrr a6, vlenb
+; CHECK-NEXT:    slli a6, a6, 5
+; CHECK-NEXT:    sub sp, sp, a6
 ; CHECK-NEXT:    andi sp, sp, -64
-; CHECK-NEXT:    vl8re64.v v24, (a5)
-; CHECK-NEXT:    slli a5, a6, 3
+; CHECK-NEXT:    add a6, a0, a1
+; CHECK-NEXT:    slli a4, a4, 3
 ; CHECK-NEXT:    addi a7, sp, 64
-; CHECK-NEXT:    add a6, a7, a5
 ; CHECK-NEXT:    mv t0, a2
-; CHECK-NEXT:    bltu a2, a4, .LBB23_4
+; CHECK-NEXT:    bltu a2, a5, .LBB23_4
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    mv t0, a4
+; CHECK-NEXT:    mv t0, a5
 ; CHECK-NEXT:  .LBB23_4:
+; CHECK-NEXT:    vl8re64.v v24, (a6)
+; CHECK-NEXT:    add a6, a7, a4
 ; CHECK-NEXT:    vl8re64.v v0, (a0)
 ; CHECK-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a7)
-; CHECK-NEXT:    sub a0, a2, a4
-; CHECK-NEXT:    add a7, a7, a1
-; CHECK-NEXT:    sub t0, a3, a4
-; CHECK-NEXT:    sltu a2, a2, a0
+; CHECK-NEXT:    sub a0, a2, a5
+; CHECK-NEXT:    sltu a2, a5, a2
 ; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a0
-; CHECK-NEXT:    sltu a0, a3, t0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, t0
-; CHECK-NEXT:    add t0, a6, a1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v16, (a7)
+; CHECK-NEXT:    and a0, a2, a0
+; CHECK-NEXT:    add a7, a7, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v24, (t0)
-; CHECK-NEXT:    bltu a3, a4, .LBB23_6
+; CHECK-NEXT:    vse64.v v16, (a7)
+; CHECK-NEXT:    mv a0, a3
+; CHECK-NEXT:    bltu a3, a5, .LBB23_6
 ; CHECK-NEXT:  # %bb.5:
-; CHECK-NEXT:    mv a3, a4
+; CHECK-NEXT:    mv a0, a5
 ; CHECK-NEXT:  .LBB23_6:
-; CHECK-NEXT:    li a2, 8
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v0, (a6)
-; CHECK-NEXT:    bltu a5, a2, .LBB23_8
+; CHECK-NEXT:    sub a2, a3, a5
+; CHECK-NEXT:    sltu a3, a5, a3
+; CHECK-NEXT:    add a5, a6, a1
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    li a3, 8
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v24, (a5)
+; CHECK-NEXT:    bltu a4, a3, .LBB23_8
 ; CHECK-NEXT:  # %bb.7:
-; CHECK-NEXT:    li a5, 8
+; CHECK-NEXT:    li a4, 8
 ; CHECK-NEXT:  .LBB23_8:
-; CHECK-NEXT:    sub a2, a6, a5
+; CHECK-NEXT:    sub a2, a6, a4
 ; CHECK-NEXT:    add a1, a2, a1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a2)
 ; CHECK-NEXT:    addi sp, s0, -80
 ; CHECK-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
index a075bba81d3c6..fb8480ee5f471 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
@@ -254,7 +254,7 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
 ; RV32-NEXT:    slli a2, a3, 1
 ; RV32-NEXT:    srli a3, a3, 2
 ; RV32-NEXT:    sub a4, a1, a2
-; RV32-NEXT:    sltu a5, a1, a4
+; RV32-NEXT:    sltu a5, a2, a1
 ; RV32-NEXT:    addi a5, a5, -1
 ; RV32-NEXT:    and a4, a5, a4
 ; RV32-NEXT:    vslidedown.vx v0, v0, a3
@@ -281,12 +281,12 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
 ; RV64-NEXT:    slli a3, a2, 1
 ; RV64-NEXT:    srli a4, a2, 2
 ; RV64-NEXT:    sub a5, a1, a3
+; RV64-NEXT:    sltu a6, a3, a1
 ; RV64-NEXT:    vslidedown.vx v13, v0, a4
-; RV64-NEXT:    sltu a4, a1, a5
-; RV64-NEXT:    addi a4, a4, -1
-; RV64-NEXT:    and a5, a4, a5
+; RV64-NEXT:    addi a6, a6, -1
+; RV64-NEXT:    and a5, a6, a5
 ; RV64-NEXT:    sub a4, a5, a2
-; RV64-NEXT:    sltu a6, a5, a4
+; RV64-NEXT:    sltu a6, a2, a5
 ; RV64-NEXT:    addi a6, a6, -1
 ; RV64-NEXT:    and a6, a6, a4
 ; RV64-NEXT:    srli a4, a2, 3
@@ -310,7 +310,7 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
 ; RV64-NEXT:    mv a1, a3
 ; RV64-NEXT:  .LBB12_4:
 ; RV64-NEXT:    sub a3, a1, a2
-; RV64-NEXT:    sltu a5, a1, a3
+; RV64-NEXT:    sltu a5, a2, a1
 ; RV64-NEXT:    addi a5, a5, -1
 ; RV64-NEXT:    and a3, a5, a3
 ; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
@@ -2367,7 +2367,7 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
 ; RV32-NEXT:    sub a2, a0, a1
 ; RV32-NEXT:    srli a3, a1, 3
 ; RV32-NEXT:    vslidedown.vx v0, v0, a3
-; RV32-NEXT:    sltu a3, a0, a2
+; RV32-NEXT:    sltu a3, a1, a0
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2390,7 +2390,7 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
 ; RV64-NEXT:    sub a2, a0, a1
 ; RV64-NEXT:    srli a3, a1, 3
 ; RV64-NEXT:    vslidedown.vx v0, v0, a3
-; RV64-NEXT:    sltu a3, a0, a2
+; RV64-NEXT:    sltu a3, a1, a0
 ; RV64-NEXT:    addi a3, a3, -1
 ; RV64-NEXT:    and a2, a3, a2
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2422,8 +2422,8 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a1, a2
+; RV32-NEXT:    sltu a1, a2, a1
 ; RV32-NEXT:    srli a2, a2, 3
-; RV32-NEXT:    sltu a1, a1, a3
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
@@ -2443,7 +2443,7 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
 ; RV64-NEXT:    srli a4, a2, 3
 ; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a4
-; RV64-NEXT:    sltu a4, a1, a3
+; RV64-NEXT:    sltu a4, a2, a1
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
@@ -2479,8 +2479,8 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a1, a2
+; RV32-NEXT:    sltu a1, a2, a1
 ; RV32-NEXT:    srli a2, a2, 3
-; RV32-NEXT:    sltu a1, a1, a3
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
@@ -2500,7 +2500,7 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
 ; RV64-NEXT:    srli a4, a2, 3
 ; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a4
-; RV64-NEXT:    sltu a4, a1, a3
+; RV64-NEXT:    sltu a4, a2, a1
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
@@ -2537,8 +2537,8 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a1, a2
+; RV32-NEXT:    sltu a1, a2, a1
 ; RV32-NEXT:    srli a2, a2, 3
-; RV32-NEXT:    sltu a1, a1, a3
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
@@ -2561,8 +2561,8 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei32.v v8, (a0), v24, v0.t
 ; RV64-NEXT:    sub a3, a1, a2
+; RV64-NEXT:    sltu a1, a2, a1
 ; RV64-NEXT:    srli a2, a2, 3
-; RV64-NEXT:    sltu a1, a1, a3
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
index 2ece316c7e54a..4d2ba719d63ed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
@@ -456,15 +456,15 @@ define <vscale x 16 x double> @vpload_nxv16f64(ptr %ptr, <vscale x 16 x i1> %m,
 ; CHECK-NEXT:    vmv1r.v v8, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    slli a4, a2, 3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    srli a5, a2, 3
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a5
-; CHECK-NEXT:    sltu a5, a1, a3
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a3, a5, a3
-; CHECK-NEXT:    add a4, a0, a4
+; CHECK-NEXT:    slli a5, a2, 3
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a4), v0.t
+; CHECK-NEXT:    vle64.v v16, (a5), v0.t
 ; CHECK-NEXT:    bltu a1, a2, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a1, a2
@@ -496,18 +496,18 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
 ; CHECK-NEXT:    mv a4, a5
 ; CHECK-NEXT:  .LBB45_2:
 ; CHECK-NEXT:    sub a6, a4, a3
-; CHECK-NEXT:    slli a7, a3, 3
-; CHECK-NEXT:    sltu t0, a4, a6
-; CHECK-NEXT:    addi t0, t0, -1
-; CHECK-NEXT:    and a6, t0, a6
-; CHECK-NEXT:    srli t0, a3, 3
-; CHECK-NEXT:    sub t1, a2, a5
-; CHECK-NEXT:    add a5, a0, a7
-; CHECK-NEXT:    sltu a2, a2, t1
+; CHECK-NEXT:    sltu a7, a3, a4
+; CHECK-NEXT:    sub t0, a2, a5
+; CHECK-NEXT:    sltu a2, a5, a2
+; CHECK-NEXT:    slli a5, a3, 3
+; CHECK-NEXT:    addi a7, a7, -1
+; CHECK-NEXT:    and a6, a7, a6
+; CHECK-NEXT:    srli a7, a3, 3
+; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, t1
-; CHECK-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, t0
+; CHECK-NEXT:    and a2, a2, t0
+; CHECK-NEXT:    vsetvli t0, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a7
 ; CHECK-NEXT:    bltu a2, a3, .LBB45_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a2, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
index f92ee37051840..01edd0f912bd6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
@@ -198,22 +198,22 @@ define <vscale x 128 x i1> @vpmerge_nxv128i1(<vscale x 128 x i1> %va, <vscale x
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB7_2:
-; CHECK-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
+; CHECK-NEXT:    sub a3, a0, a2
+; CHECK-NEXT:    sltu a0, a2, a0
+; CHECK-NEXT:    vsetvli a2, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    sub a2, a0, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vim v24, v16, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
-; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vmv1r.v v0, v4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
-; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmsne.vi v24, v8, 0
-; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
@@ -547,7 +547,7 @@ define <vscale x 128 x i8> @vpmerge_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    sub a4, a3, a1
 ; CHECK-NEXT:    vl8r.v v24, (a2)
-; CHECK-NEXT:    sltu a2, a3, a4
+; CHECK-NEXT:    sltu a2, a1, a3
 ; CHECK-NEXT:    vl8r.v v8, (a0)
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a4
@@ -583,7 +583,7 @@ define <vscale x 128 x i8> @vpmerge_vx_nxv128i8(i8 %a, <vscale x 128 x i8> %vb,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, tu, ma
@@ -611,7 +611,7 @@ define <vscale x 128 x i8> @vpmerge_vi_nxv128i8(<vscale x 128 x i8> %vb, <vscale
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub a2, a1, a0
-; CHECK-NEXT:    sltu a3, a1, a2
+; CHECK-NEXT:    sltu a3, a0, a1
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
index 7e4a60095d7cc..153a0a70d098a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
@@ -2193,8 +2193,8 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (zero), v24, v0.t
 ; RV32-NEXT:    sub a2, a1, a0
+; RV32-NEXT:    sltu a1, a0, a1
 ; RV32-NEXT:    srli a0, a0, 3
-; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a0
@@ -2226,8 +2226,8 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (zero), v24, v0.t
 ; RV64-NEXT:    sub a0, a2, a1
+; RV64-NEXT:    sltu a2, a1, a2
 ; RV64-NEXT:    srli a1, a1, 3
-; RV64-NEXT:    sltu a2, a2, a0
 ; RV64-NEXT:    addi a2, a2, -1
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a1
@@ -2263,8 +2263,8 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a2, a1
+; RV32-NEXT:    sltu a2, a1, a2
 ; RV32-NEXT:    srli a1, a1, 3
-; RV32-NEXT:    sltu a2, a2, a3
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a1
@@ -2298,8 +2298,8 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
 ; RV64-NEXT:    sub a3, a2, a1
+; RV64-NEXT:    sltu a2, a1, a2
 ; RV64-NEXT:    srli a1, a1, 3
-; RV64-NEXT:    sltu a2, a2, a3
 ; RV64-NEXT:    addi a2, a2, -1
 ; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a1
@@ -2336,8 +2336,8 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a2, a1
+; RV32-NEXT:    sltu a2, a1, a2
 ; RV32-NEXT:    srli a1, a1, 3
-; RV32-NEXT:    sltu a2, a2, a3
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a1
@@ -2371,8 +2371,8 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
 ; RV64-NEXT:    sub a3, a2, a1
+; RV64-NEXT:    sltu a2, a1, a2
 ; RV64-NEXT:    srli a1, a1, 3
-; RV64-NEXT:    sltu a2, a2, a3
 ; RV64-NEXT:    addi a2, a2, -1
 ; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a1
@@ -2410,8 +2410,8 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a2, a1
+; RV32-NEXT:    sltu a2, a1, a2
 ; RV32-NEXT:    srli a1, a1, 3
-; RV32-NEXT:    sltu a2, a2, a3
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a1
@@ -2435,8 +2435,8 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV64-NEXT:    sub a3, a2, a1
+; RV64-NEXT:    sltu a2, a1, a2
 ; RV64-NEXT:    srli a1, a1, 3
-; RV64-NEXT:    sltu a2, a2, a3
 ; RV64-NEXT:    addi a2, a2, -1
 ; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
index 9fd8b9d23cb5e..3468fda9011a0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
@@ -373,8 +373,8 @@ define void @vpstore_nxv16f64(<vscale x 16 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
+; CHECK-NEXT:    sltu a1, a2, a1
 ; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sltu a1, a1, a3
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a1, a1, a3
 ; CHECK-NEXT:    add a0, a0, a2
@@ -409,20 +409,20 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a1), v0.t
 ; CHECK-NEXT:    sub a0, a5, a3
-; CHECK-NEXT:    srli a6, a3, 3
+; CHECK-NEXT:    sltu a5, a3, a5
+; CHECK-NEXT:    sub a6, a2, a4
+; CHECK-NEXT:    sltu a2, a4, a2
+; CHECK-NEXT:    srli a4, a3, 3
 ; CHECK-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v7, a6
-; CHECK-NEXT:    slli a6, a3, 3
-; CHECK-NEXT:    sub a4, a2, a4
-; CHECK-NEXT:    sltu a5, a5, a0
-; CHECK-NEXT:    add a6, a1, a6
-; CHECK-NEXT:    sltu a2, a2, a4
+; CHECK-NEXT:    vslidedown.vx v0, v7, a4
+; CHECK-NEXT:    slli a4, a3, 3
 ; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    add a4, a1, a4
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a5, a5, a0
-; CHECK-NEXT:    and a0, a2, a4
+; CHECK-NEXT:    and a0, a2, a6
 ; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v16, (a6), v0.t
+; CHECK-NEXT:    vse64.v v16, (a4), v0.t
 ; CHECK-NEXT:    bltu a0, a3, .LBB36_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a0, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
index df97f19df7f99..4f31167b80691 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
@@ -91,7 +91,7 @@ define half @vpreduce_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscale x
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v24, v0, a1
 ; CHECK-NEXT:    sub a1, a0, a2
-; CHECK-NEXT:    sltu a3, a0, a1
+; CHECK-NEXT:    sltu a3, a2, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a1, a3, a1
 ; CHECK-NEXT:    bltu a0, a2, .LBB6_2
@@ -120,7 +120,7 @@ define half @vpreduce_ord_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscal
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v24, v0, a1
 ; CHECK-NEXT:    sub a1, a0, a2
-; CHECK-NEXT:    sltu a3, a0, a1
+; CHECK-NEXT:    sltu a3, a2, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a1, a3, a1
 ; CHECK-NEXT:    bltu a0, a2, .LBB7_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
index 7eea35afe0aa0..f2b84c28db92e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
@@ -884,7 +884,7 @@ define signext i32 @vpreduce_umax_nxv32i32(i32 signext %s, <vscale x 32 x i32> %
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v24, v0, a2
 ; CHECK-NEXT:    sub a2, a1, a3
-; CHECK-NEXT:    sltu a4, a1, a2
+; CHECK-NEXT:    sltu a4, a3, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a2, a4, a2
 ; CHECK-NEXT:    bltu a1, a3, .LBB67_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
index 1e629e9d20530..535a5bdb839e0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
@@ -318,7 +318,7 @@ define zeroext i1 @vpreduce_or_nxv128i1(i1 zeroext %s, <vscale x 128 x i1> %v, <
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
index 98634fe55de41..b4ed1857652f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
@@ -557,7 +557,7 @@ define <vscale x 128 x i8> @vsadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub a2, a1, a0
-; CHECK-NEXT:    sltu a3, a1, a2
+; CHECK-NEXT:    sltu a3, a0, a1
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
@@ -580,7 +580,7 @@ define <vscale x 128 x i8> @vsadd_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
@@ -1312,7 +1312,7 @@ define <vscale x 32 x i32> @vsadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -1335,7 +1335,7 @@ define <vscale x 32 x i32> @vsadd_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
index a7d304261f87f..d761b8da7929c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
@@ -556,7 +556,7 @@ define <vscale x 128 x i8> @vsaddu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub a2, a1, a0
-; CHECK-NEXT:    sltu a3, a1, a2
+; CHECK-NEXT:    sltu a3, a0, a1
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
@@ -579,7 +579,7 @@ define <vscale x 128 x i8> @vsaddu_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
@@ -1311,7 +1311,7 @@ define <vscale x 32 x i32> @vsaddu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -1334,7 +1334,7 @@ define <vscale x 32 x i32> @vsaddu_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index d1933560f2698..e6ef1bcf73a3d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -308,12 +308,12 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a4, a3, 3
 ; CHECK-NEXT:    slli a1, a3, 1
-; CHECK-NEXT:    srli a3, a3, 2
 ; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    sub a5, a2, a1
 ; CHECK-NEXT:    vl8re32.v v24, (a4)
-; CHECK-NEXT:    sltu a4, a2, a5
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    srli a3, a3, 2
 ; CHECK-NEXT:    vl8re32.v v8, (a0)
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    and a4, a4, a5
@@ -349,14 +349,14 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
 ; RV32-NEXT:    slli a2, a1, 3
 ; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    slli a2, a1, 1
-; RV32-NEXT:    sub a2, a1, a2
 ; RV32-NEXT:    vl8re32.v v24, (a0)
-; RV32-NEXT:    sltu a0, a1, a2
-; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    sub a0, a1, a2
+; RV32-NEXT:    sltu a2, a2, a1
+; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    srli a1, a1, 2
 ; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV32-NEXT:    vmerge.vvm v16, v24, v16, v0
 ; RV32-NEXT:    ret
@@ -376,16 +376,16 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a3, a1, 3
 ; RV64-NEXT:    slli a2, a1, 1
-; RV64-NEXT:    srli a4, a1, 2
 ; RV64-NEXT:    add a3, a0, a3
-; RV64-NEXT:    sub a5, a1, a2
+; RV64-NEXT:    sub a4, a1, a2
+; RV64-NEXT:    sltu a5, a2, a1
 ; RV64-NEXT:    vl8re32.v v24, (a3)
-; RV64-NEXT:    sltu a3, a1, a5
-; RV64-NEXT:    addi a3, a3, -1
+; RV64-NEXT:    addi a5, a5, -1
+; RV64-NEXT:    srli a3, a1, 2
 ; RV64-NEXT:    vl8re32.v v8, (a0)
-; RV64-NEXT:    vslidedown.vx v0, v0, a4
-; RV64-NEXT:    and a3, a3, a5
-; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; RV64-NEXT:    vslidedown.vx v0, v0, a3
+; RV64-NEXT:    and a4, a5, a4
+; RV64-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
 ; RV64-NEXT:    vmerge.vvm v16, v24, v16, v0
 ; RV64-NEXT:    bltu a1, a2, .LBB28_2
 ; RV64-NEXT:  # %bb.1:
@@ -637,10 +637,10 @@ define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
 ; CHECK-NEXT:    sub a4, a2, a1
+; CHECK-NEXT:    sltu a5, a1, a2
 ; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    sltu a5, a2, a4
-; CHECK-NEXT:    vl8re64.v v24, (a3)
 ; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    vl8re64.v v24, (a3)
 ; CHECK-NEXT:    srli a3, a1, 3
 ; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
index 07411b1c7ae08..c8bb009d2c3b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
@@ -144,7 +144,7 @@ define <vscale x 32 x i32> @vsext_nxv32i8_nxv32i32(<vscale x 32 x i8> %a, <vscal
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -168,7 +168,7 @@ define <vscale x 32 x i32> @vsext_nxv32i8_nxv32i32_unmasked(<vscale x 32 x i8> %
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
index 7f96da141c363..90f1ca0843b02 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
@@ -478,7 +478,7 @@ define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFH-NEXT:    slli a1, a1, 1
 ; ZVFH-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFH-NEXT:    sub a2, a0, a1
-; ZVFH-NEXT:    sltu a3, a0, a2
+; ZVFH-NEXT:    sltu a3, a1, a0
 ; ZVFH-NEXT:    addi a3, a3, -1
 ; ZVFH-NEXT:    and a2, a3, a2
 ; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -502,7 +502,7 @@ define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a2, a3, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -534,7 +534,7 @@ define <vscale x 32 x float> @vsitofp_nxv32f32_nxv32i32(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -557,7 +557,7 @@ define <vscale x 32 x float> @vsitofp_nxv32f32_nxv32i32_unmasked(<vscale x 32 x
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
index 0ac2ef7e251c0..a6a631be9dab4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
@@ -574,7 +574,7 @@ define <vscale x 128 x i8> @vssub_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub a2, a1, a0
-; CHECK-NEXT:    sltu a3, a1, a2
+; CHECK-NEXT:    sltu a3, a0, a1
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
@@ -598,7 +598,7 @@ define <vscale x 128 x i8> @vssub_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
@@ -1353,7 +1353,7 @@ define <vscale x 32 x i32> @vssub_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
@@ -1377,7 +1377,7 @@ define <vscale x 32 x i32> @vssub_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
index bde279a4d1f2b..1992b97e0de0d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
@@ -572,7 +572,7 @@ define <vscale x 128 x i8> @vssubu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub a2, a1, a0
-; CHECK-NEXT:    sltu a3, a1, a2
+; CHECK-NEXT:    sltu a3, a0, a1
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
@@ -596,7 +596,7 @@ define <vscale x 128 x i8> @vssubu_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
@@ -1351,7 +1351,7 @@ define <vscale x 32 x i32> @vssubu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
@@ -1375,7 +1375,7 @@ define <vscale x 32 x i32> @vssubu_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
index 0c1ca369521f7..0b07b60da8250 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
@@ -147,7 +147,7 @@ define <vscale x 15 x i16> @vtrunc_nxv15i16_nxv15i64(<vscale x 15 x i64> %a, <vs
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -201,7 +201,7 @@ define <vscale x 32 x i7> @vtrunc_nxv32i7_nxv32i32(<vscale x 32 x i32> %a, <vsca
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -233,7 +233,7 @@ define <vscale x 32 x i8> @vtrunc_nxv32i8_nxv32i32(<vscale x 32 x i32> %a, <vsca
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -280,11 +280,11 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    slli a3, a1, 1
 ; CHECK-NEXT:    add a6, a0, a4
 ; CHECK-NEXT:    sub a0, a2, a3
-; CHECK-NEXT:    sltu a4, a2, a0
+; CHECK-NEXT:    sltu a4, a3, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a0, a4, a0
 ; CHECK-NEXT:    sub a4, a0, a1
-; CHECK-NEXT:    sltu a7, a0, a4
+; CHECK-NEXT:    sltu a7, a1, a0
 ; CHECK-NEXT:    addi a7, a7, -1
 ; CHECK-NEXT:    and a4, a7, a4
 ; CHECK-NEXT:    srli a7, a1, 2
@@ -307,7 +307,7 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    mv a2, a3
 ; CHECK-NEXT:  .LBB17_4:
 ; CHECK-NEXT:    sub a0, a2, a1
-; CHECK-NEXT:    sltu a3, a2, a0
+; CHECK-NEXT:    sltu a3, a1, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a0, a3, a0
 ; CHECK-NEXT:    vmv1r.v v0, v6
diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
index c0c749ebf3186..807c2d9fa3ce6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
@@ -478,7 +478,7 @@ define <vscale x 32 x half> @vuitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFH-NEXT:    slli a1, a1, 1
 ; ZVFH-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFH-NEXT:    sub a2, a0, a1
-; ZVFH-NEXT:    sltu a3, a0, a2
+; ZVFH-NEXT:    sltu a3, a1, a0
 ; ZVFH-NEXT:    addi a3, a3, -1
 ; ZVFH-NEXT:    and a2, a3, a2
 ; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -502,7 +502,7 @@ define <vscale x 32 x half> @vuitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a2, a3, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -534,7 +534,7 @@ define <vscale x 32 x float> @vuitofp_nxv32f32_nxv32i32(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -557,7 +557,7 @@ define <vscale x 32 x float> @vuitofp_nxv32f32_nxv32i32_unmasked(<vscale x 32 x
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
index 9713b617b8384..44a1084b4a208 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
@@ -144,7 +144,7 @@ define <vscale x 32 x i32> @vzext_nxv32i8_nxv32i32(<vscale x 32 x i8> %a, <vscal
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -168,7 +168,7 @@ define <vscale x 32 x i32> @vzext_nxv32i8_nxv32i32_unmasked(<vscale x 32 x i8> %
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/usub_sat.ll b/llvm/test/CodeGen/RISCV/usub_sat.ll
index 33056682dcc79..6fcc6bc5f3dcd 100644
--- a/llvm/test/CodeGen/RISCV/usub_sat.ll
+++ b/llvm/test/CodeGen/RISCV/usub_sat.ll
@@ -7,10 +7,10 @@
 define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
 ; RV32I-LABEL: func:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func:
@@ -57,10 +57,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ;
 ; RV64I-LABEL: func2:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func2:
@@ -93,18 +93,18 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y) nounwind {
 ; RV32I-LABEL: func16:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func16:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func16:
@@ -125,18 +125,18 @@ define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y) nounwind {
 define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y) nounwind {
 ; RV32I-LABEL: func8:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func8:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func8:
@@ -157,18 +157,18 @@ define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y) nounwind {
 define zeroext i4 @func3(i4 zeroext %x, i4 zeroext %y) nounwind {
 ; RV32I-LABEL: func3:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func3:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func3:
diff --git a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
index ef6bc022ddc9f..838f2dbe2276d 100644
--- a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
@@ -8,10 +8,10 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; RV32I-LABEL: func32:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mul a1, a1, a2
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func32:
@@ -65,7 +65,7 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV64I-LABEL: func64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sub a1, a0, a2
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sltu a0, a2, a0
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -106,10 +106,10 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV32I-NEXT:    addi a3, a3, -1
 ; RV32I-NEXT:    and a0, a0, a3
 ; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func16:
@@ -119,10 +119,10 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV64I-NEXT:    addi a3, a3, -1
 ; RV64I-NEXT:    and a0, a0, a3
 ; RV64I-NEXT:    and a1, a1, a3
-; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func16:
@@ -153,10 +153,10 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; RV32I-NEXT:    zext.b a0, a0
 ; RV32I-NEXT:    mul a1, a1, a2
 ; RV32I-NEXT:    zext.b a1, a1
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func8:
@@ -164,10 +164,10 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; RV64I-NEXT:    zext.b a0, a0
 ; RV64I-NEXT:    mul a1, a1, a2
 ; RV64I-NEXT:    zext.b a1, a1
-; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func8:
@@ -198,10 +198,10 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; RV32I-NEXT:    andi a0, a0, 15
 ; RV32I-NEXT:    mul a1, a1, a2
 ; RV32I-NEXT:    andi a1, a1, 15
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func4:
@@ -209,10 +209,10 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; RV64I-NEXT:    andi a0, a0, 15
 ; RV64I-NEXT:    mul a1, a1, a2
 ; RV64I-NEXT:    andi a1, a1, 15
-; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func4:
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index 62f08d7831dda..0de2cbd76b749 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -937,9 +937,10 @@ entry:
 define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ; RV32-LABEL: usubo.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    sub a1, a0, a1
-; RV32-NEXT:    sltu a0, a0, a1
-; RV32-NEXT:    sw a1, 0(a2)
+; RV32-NEXT:    sltu a3, a1, a0
+; RV32-NEXT:    sub a0, a0, a1
+; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    mv a0, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: usubo.i32:
@@ -951,9 +952,10 @@ define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ;
 ; RV32ZBA-LABEL: usubo.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    sub a1, a0, a1
-; RV32ZBA-NEXT:    sltu a0, a0, a1
-; RV32ZBA-NEXT:    sw a1, 0(a2)
+; RV32ZBA-NEXT:    sltu a3, a1, a0
+; RV32ZBA-NEXT:    sub a0, a0, a1
+; RV32ZBA-NEXT:    sw a0, 0(a2)
+; RV32ZBA-NEXT:    mv a0, a3
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: usubo.i32:
@@ -965,9 +967,10 @@ define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ;
 ; RV32ZICOND-LABEL: usubo.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    sub a1, a0, a1
-; RV32ZICOND-NEXT:    sltu a0, a0, a1
-; RV32ZICOND-NEXT:    sw a1, 0(a2)
+; RV32ZICOND-NEXT:    sltu a3, a1, a0
+; RV32ZICOND-NEXT:    sub a0, a0, a1
+; RV32ZICOND-NEXT:    sw a0, 0(a2)
+; RV32ZICOND-NEXT:    mv a0, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: usubo.i32:
@@ -987,9 +990,11 @@ entry:
 define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
 ; RV32-LABEL: usubo.i32.constant.rhs:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    addi a2, a0, 2
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    sw a2, 0(a1)
+; RV32-NEXT:    addi a2, a0, 1
+; RV32-NEXT:    seqz a2, a2
+; RV32-NEXT:    addi a0, a0, 2
+; RV32-NEXT:    sw a0, 0(a1)
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: usubo.i32.constant.rhs:
@@ -1001,9 +1006,11 @@ define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
 ;
 ; RV32ZBA-LABEL: usubo.i32.constant.rhs:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    addi a2, a0, 2
-; RV32ZBA-NEXT:    sltu a0, a0, a2
-; RV32ZBA-NEXT:    sw a2, 0(a1)
+; RV32ZBA-NEXT:    addi a2, a0, 1
+; RV32ZBA-NEXT:    seqz a2, a2
+; RV32ZBA-NEXT:    addi a0, a0, 2
+; RV32ZBA-NEXT:    sw a0, 0(a1)
+; RV32ZBA-NEXT:    mv a0, a2
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: usubo.i32.constant.rhs:
@@ -1015,9 +1022,11 @@ define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
 ;
 ; RV32ZICOND-LABEL: usubo.i32.constant.rhs:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    addi a2, a0, 2
-; RV32ZICOND-NEXT:    sltu a0, a0, a2
-; RV32ZICOND-NEXT:    sw a2, 0(a1)
+; RV32ZICOND-NEXT:    addi a2, a0, 1
+; RV32ZICOND-NEXT:    seqz a2, a2
+; RV32ZICOND-NEXT:    addi a0, a0, 2
+; RV32ZICOND-NEXT:    sw a0, 0(a1)
+; RV32ZICOND-NEXT:    mv a0, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: usubo.i32.constant.rhs:
@@ -1039,8 +1048,7 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    li a2, -2
 ; RV32-NEXT:    sub a2, a2, a0
-; RV32-NEXT:    addi a0, a2, 1
-; RV32-NEXT:    seqz a0, a0
+; RV32-NEXT:    sltiu a0, a0, -2
 ; RV32-NEXT:    sw a2, 0(a1)
 ; RV32-NEXT:    ret
 ;
@@ -1057,8 +1065,7 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    li a2, -2
 ; RV32ZBA-NEXT:    sub a2, a2, a0
-; RV32ZBA-NEXT:    addi a0, a2, 1
-; RV32ZBA-NEXT:    seqz a0, a0
+; RV32ZBA-NEXT:    sltiu a0, a0, -2
 ; RV32ZBA-NEXT:    sw a2, 0(a1)
 ; RV32ZBA-NEXT:    ret
 ;
@@ -1075,8 +1082,7 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    li a2, -2
 ; RV32ZICOND-NEXT:    sub a2, a2, a0
-; RV32ZICOND-NEXT:    addi a0, a2, 1
-; RV32ZICOND-NEXT:    seqz a0, a0
+; RV32ZICOND-NEXT:    sltiu a0, a0, -2
 ; RV32ZICOND-NEXT:    sw a2, 0(a1)
 ; RV32ZICOND-NEXT:    ret
 ;
@@ -1116,9 +1122,10 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ;
 ; RV64-LABEL: usubo.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    sd a1, 0(a2)
+; RV64-NEXT:    sltu a3, a1, a0
+; RV64-NEXT:    sub a0, a0, a1
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    mv a0, a3
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: usubo.i64:
@@ -1140,9 +1147,10 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ;
 ; RV64ZBA-LABEL: usubo.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    sltu a0, a0, a1
-; RV64ZBA-NEXT:    sd a1, 0(a2)
+; RV64ZBA-NEXT:    sltu a3, a1, a0
+; RV64ZBA-NEXT:    sub a0, a0, a1
+; RV64ZBA-NEXT:    sd a0, 0(a2)
+; RV64ZBA-NEXT:    mv a0, a3
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: usubo.i64:
@@ -1163,9 +1171,10 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ;
 ; RV64ZICOND-LABEL: usubo.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    sltu a0, a0, a1
-; RV64ZICOND-NEXT:    sd a1, 0(a2)
+; RV64ZICOND-NEXT:    sltu a3, a1, a0
+; RV64ZICOND-NEXT:    sub a0, a0, a1
+; RV64ZICOND-NEXT:    sd a0, 0(a2)
+; RV64ZICOND-NEXT:    mv a0, a3
 ; RV64ZICOND-NEXT:    ret
 entry:
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
@@ -2810,8 +2819,7 @@ entry:
 define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32-LABEL: usubo.select.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    sub a2, a0, a1
-; RV32-NEXT:    bltu a0, a2, .LBB40_2
+; RV32-NEXT:    bltu a1, a0, .LBB40_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:  .LBB40_2: # %entry
@@ -2828,8 +2836,7 @@ define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZBA-LABEL: usubo.select.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    sub a2, a0, a1
-; RV32ZBA-NEXT:    bltu a0, a2, .LBB40_2
+; RV32ZBA-NEXT:    bltu a1, a0, .LBB40_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
 ; RV32ZBA-NEXT:    mv a0, a1
 ; RV32ZBA-NEXT:  .LBB40_2: # %entry
@@ -2846,8 +2853,7 @@ define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZICOND-LABEL: usubo.select.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    sub a2, a0, a1
-; RV32ZICOND-NEXT:    sltu a2, a0, a2
+; RV32ZICOND-NEXT:    sltu a2, a1, a0
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a2
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a2
 ; RV32ZICOND-NEXT:    or a0, a0, a1
@@ -2871,8 +2877,7 @@ entry:
 define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32-LABEL: usubo.not.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    sub a1, a0, a1
-; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    sltu a0, a1, a0
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:    ret
 ;
@@ -2885,8 +2890,7 @@ define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZBA-LABEL: usubo.not.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    sub a1, a0, a1
-; RV32ZBA-NEXT:    sltu a0, a0, a1
+; RV32ZBA-NEXT:    sltu a0, a1, a0
 ; RV32ZBA-NEXT:    xori a0, a0, 1
 ; RV32ZBA-NEXT:    ret
 ;
@@ -2899,8 +2903,7 @@ define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZICOND-LABEL: usubo.not.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    sub a1, a0, a1
-; RV32ZICOND-NEXT:    sltu a0, a0, a1
+; RV32ZICOND-NEXT:    sltu a0, a1, a0
 ; RV32ZICOND-NEXT:    xori a0, a0, 1
 ; RV32ZICOND-NEXT:    ret
 ;
@@ -2940,8 +2943,7 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64-LABEL: usubo.select.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sub a2, a0, a1
-; RV64-NEXT:    bltu a0, a2, .LBB42_2
+; RV64-NEXT:    bltu a1, a0, .LBB42_2
 ; RV64-NEXT:  # %bb.1: # %entry
 ; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:  .LBB42_2: # %entry
@@ -2969,8 +2971,7 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZBA-LABEL: usubo.select.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sub a2, a0, a1
-; RV64ZBA-NEXT:    bltu a0, a2, .LBB42_2
+; RV64ZBA-NEXT:    bltu a1, a0, .LBB42_2
 ; RV64ZBA-NEXT:  # %bb.1: # %entry
 ; RV64ZBA-NEXT:    mv a0, a1
 ; RV64ZBA-NEXT:  .LBB42_2: # %entry
@@ -2998,8 +2999,7 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZICOND-LABEL: usubo.select.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sub a2, a0, a1
-; RV64ZICOND-NEXT:    sltu a2, a0, a2
+; RV64ZICOND-NEXT:    sltu a2, a1, a0
 ; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
 ; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
 ; RV64ZICOND-NEXT:    or a0, a0, a1
@@ -3030,8 +3030,7 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64-LABEL: usubo.not.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    sltu a0, a1, a0
 ; RV64-NEXT:    xori a0, a0, 1
 ; RV64-NEXT:    ret
 ;
@@ -3053,8 +3052,7 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZBA-LABEL: usubo.not.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    sltu a0, a0, a1
+; RV64ZBA-NEXT:    sltu a0, a1, a0
 ; RV64ZBA-NEXT:    xori a0, a0, 1
 ; RV64ZBA-NEXT:    ret
 ;
@@ -3075,8 +3073,7 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZICOND-LABEL: usubo.not.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    sltu a0, a0, a1
+; RV64ZICOND-NEXT:    sltu a0, a1, a0
 ; RV64ZICOND-NEXT:    xori a0, a0, 1
 ; RV64ZICOND-NEXT:    ret
 entry:
@@ -4379,8 +4376,7 @@ continue:
 define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32-LABEL: usubo.br.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    sub a1, a0, a1
-; RV32-NEXT:    bgeu a0, a1, .LBB58_2
+; RV32-NEXT:    bgeu a1, a0, .LBB58_2
 ; RV32-NEXT:  # %bb.1: # %overflow
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    ret
@@ -4401,8 +4397,7 @@ define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZBA-LABEL: usubo.br.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    sub a1, a0, a1
-; RV32ZBA-NEXT:    bgeu a0, a1, .LBB58_2
+; RV32ZBA-NEXT:    bgeu a1, a0, .LBB58_2
 ; RV32ZBA-NEXT:  # %bb.1: # %overflow
 ; RV32ZBA-NEXT:    li a0, 0
 ; RV32ZBA-NEXT:    ret
@@ -4423,8 +4418,7 @@ define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZICOND-LABEL: usubo.br.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    sub a1, a0, a1
-; RV32ZICOND-NEXT:    bgeu a0, a1, .LBB58_2
+; RV32ZICOND-NEXT:    bgeu a1, a0, .LBB58_2
 ; RV32ZICOND-NEXT:  # %bb.1: # %overflow
 ; RV32ZICOND-NEXT:    li a0, 0
 ; RV32ZICOND-NEXT:    ret
@@ -4478,8 +4472,7 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64-LABEL: usubo.br.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    bgeu a0, a1, .LBB59_2
+; RV64-NEXT:    bgeu a1, a0, .LBB59_2
 ; RV64-NEXT:  # %bb.1: # %overflow
 ; RV64-NEXT:    li a0, 0
 ; RV64-NEXT:    ret
@@ -4509,8 +4502,7 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZBA-LABEL: usubo.br.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    bgeu a0, a1, .LBB59_2
+; RV64ZBA-NEXT:    bgeu a1, a0, .LBB59_2
 ; RV64ZBA-NEXT:  # %bb.1: # %overflow
 ; RV64ZBA-NEXT:    li a0, 0
 ; RV64ZBA-NEXT:    ret
@@ -4540,8 +4532,7 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZICOND-LABEL: usubo.br.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    bgeu a0, a1, .LBB59_2
+; RV64ZICOND-NEXT:    bgeu a1, a0, .LBB59_2
 ; RV64ZICOND-NEXT:  # %bb.1: # %overflow
 ; RV64ZICOND-NEXT:    li a0, 0
 ; RV64ZICOND-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/xqcia.ll b/llvm/test/CodeGen/RISCV/xqcia.ll
index 3bbf33328f529..6d5fc765c49a8 100644
--- a/llvm/test/CodeGen/RISCV/xqcia.ll
+++ b/llvm/test/CodeGen/RISCV/xqcia.ll
@@ -71,10 +71,10 @@ define i32 @subsat(i32 %a, i32 %b) {
 define i32 @subusat(i32 %a, i32 %b) {
 ; RV32I-LABEL: subusat:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32IXQCIA-LABEL: subusat:
diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index fc73ce5503ffe..da2123a5dfe74 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -1068,12 +1068,12 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
 ; CHECK-LABEL: @extract_value_usub(
 ; CHECK-NEXT:    [[Z:%.*]] = add nuw i8 [[ZZ:%.*]], 1
 ; CHECK-NEXT:    [[Y:%.*]] = add i8 [[X:%.*]], [[Z]]
-; CHECK-NEXT:    [[SUB:%.*]] = xor i8 [[ZZ]], -1
-; CHECK-NEXT:    [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
+; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
+; CHECK-NEXT:    [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
 ; CHECK-NEXT:    call void @use.i1(i1 [[UOV]])
 ; CHECK-NEXT:    call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[ZZ]], -1
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %z = add nuw i8 %zz, 1
   %y = add i8 %x, %z
@@ -1090,11 +1090,12 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
 define i1 @extract_value_usub_fail(i8 %x, i8 %z) {
 ; CHECK-LABEL: @extract_value_usub_fail(
 ; CHECK-NEXT:    [[Y:%.*]] = add i8 [[X:%.*]], [[Z:%.*]]
-; CHECK-NEXT:    [[SUB:%.*]] = sub i8 0, [[Z]]
-; CHECK-NEXT:    [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
+; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
+; CHECK-NEXT:    [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
 ; CHECK-NEXT:    call void @use.i1(i1 [[UOV]])
 ; CHECK-NEXT:    call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[Z]], 0
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[SUB]], 0
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %y = add i8 %x, %z
diff --git a/llvm/test/Transforms/InstCombine/pr170634.ll b/llvm/test/Transforms/InstCombine/pr170634.ll
index 62a332e14b04a..3224b8b63afd3 100644
--- a/llvm/test/Transforms/InstCombine/pr170634.ll
+++ b/llvm/test/Transforms/InstCombine/pr170634.ll
@@ -3,12 +3,13 @@
 define dso_local i64 @func(i64 noundef %x, i64 noundef %y) local_unnamed_addr {
 ; CHECK-LABEL: @func(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
+; CHECK-NEXT:    [[TMP0:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    br label [[RETURN:%.*]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[TMP1:%.*]] = sub nuw i64 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       return:
 ; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i64 [ 291, [[IF_THEN]] ], [ [[TMP1]], [[IF_END]] ]
diff --git a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
index f8b318bc3680a..30a5072c7edc8 100644
--- a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
@@ -141,16 +141,16 @@ define i1 @t1_strict_logical(i8 %base, i8 %offset) {
 
 define i1 @t2(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t2(
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -168,16 +168,16 @@ define i1 @t2(i8 %base, i8 %offset) {
 
 define i1 @t2_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t2_logical(
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -321,16 +321,16 @@ define i1 @t5_commutability2_logical(i8 %base, i8 %offset) {
 
 define i1 @t6_commutability(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t6_commutability(
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -348,16 +348,16 @@ define i1 @t6_commutability(i8 %base, i8 %offset) {
 
 define i1 @t6_commutability_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t6_commutability_logical(
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -459,14 +459,14 @@ define i1 @t7_nonstrict_logical(i8 %base, i8 %offset) {
 
 define i1 @t8(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t8(
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -482,14 +482,14 @@ define i1 @t8(i8 %base, i8 %offset) {
 
 define i1 @t8_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t8_logical(
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
diff --git a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
index c9030e5ab0321..90ca39a70a0bb 100644
--- a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
+++ b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
@@ -175,10 +175,11 @@ define i32 @test7(i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[BB1:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -204,10 +205,11 @@ define i32 @test8(i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    br i1 [[COND_NOT]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -294,10 +296,11 @@ define i32 @test10(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[AND:%.*]] = and i1 [[COND]], [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -325,10 +328,11 @@ define i32 @test10_logical(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[AND:%.*]] = select i1 [[COND]], i1 [[COND2:%.*]], i1 false
 ; CHECK-NEXT:    br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -356,10 +360,11 @@ define i32 @test11(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -387,10 +392,11 @@ define i32 @test11_logical(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -418,10 +424,11 @@ define i32 @test12(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -449,10 +456,11 @@ define i32 @test12_logical(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
diff --git a/llvm/test/Transforms/InstCombine/usubo.ll b/llvm/test/Transforms/InstCombine/usubo.ll
index e4b9c0e08ba22..2074190a2cd45 100644
--- a/llvm/test/Transforms/InstCombine/usubo.ll
+++ b/llvm/test/Transforms/InstCombine/usubo.ll
@@ -130,9 +130,10 @@ define i1 @sub_ne0(i8 %x, i8 %y, i1 %b) {
 
 define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
 ; CHECK-LABEL: @sub_eq1(
-; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
 ; CHECK-NEXT:    call void @use(i1 [[OV]])
+; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
 ; CHECK-NEXT:    [[EQ1:%.*]] = icmp eq i8 [[SUB]], 1
 ; CHECK-NEXT:    ret i1 [[EQ1]]
 ;
@@ -148,9 +149,10 @@ define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
 
 define i1 @sub_sgt0(i8 %x, i8 %y, i1 %b) {
 ; CHECK-LABEL: @sub_sgt0(
-; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
 ; CHECK-NEXT:    call void @use(i1 [[OV]])
+; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
 ; CHECK-NEXT:    [[SGT0:%.*]] = icmp sgt i8 [[SUB]], 0
 ; CHECK-NEXT:    ret i1 [[SGT0]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/with_overflow.ll b/llvm/test/Transforms/InstCombine/with_overflow.ll
index 0c82bdc256ddf..09ef32262ea78 100644
--- a/llvm/test/Transforms/InstCombine/with_overflow.ll
+++ b/llvm/test/Transforms/InstCombine/with_overflow.ll
@@ -506,10 +506,7 @@ define { i32, i1 } @ssub_no_canonicalize_constant_arg0(i32 %x) nounwind {
 
 define { i32, i1 } @usub_no_canonicalize_constant_arg0(i32 %x) nounwind {
 ; CHECK-LABEL: @usub_no_canonicalize_constant_arg0(
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 42, [[X:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 42, [[X]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i32, i1 } poison, i32 [[TMP1]], 0
-; CHECK-NEXT:    [[A:%.*]] = insertvalue { i32, i1 } [[TMP3]], i1 [[TMP2]], 1
+; CHECK-NEXT:    [[A:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 [[X:%.*]])
 ; CHECK-NEXT:    ret { i32, i1 } [[A]]
 ;
   %a = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 %x)


