[llvm] [SelectionDAG] Fix condition used for unsigned subtraction overflow (PR #170896)

via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 11 07:12:50 PST 2025


https://github.com/aabhinavg1 updated https://github.com/llvm/llvm-project/pull/170896

From b268573d6a03fe14e22a7703dcd6a284e5d0ca9a Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Fri, 5 Dec 2025 23:43:14 +0530
Subject: [PATCH 1/8] [InstCombine] Lower usub.with.overflow to explicit
 subtraction + unsigned comparison

---
 .../InstCombine/InstCombineCalls.cpp          | 13 ++++
 .../test/Transforms/InstCombine/known-bits.ll | 15 +++--
 llvm/test/Transforms/InstCombine/pr170634.ll  | 33 ++++++++++
 ...ult-of-usub-is-non-zero-and-no-overflow.ll | 60 +++++++++----------
 .../usub-overflow-known-by-implied-cond.ll    | 40 +++++--------
 llvm/test/Transforms/InstCombine/usubo.ll     | 10 ++--
 .../Transforms/InstCombine/with_overflow.ll   |  7 ++-
 7 files changed, 108 insertions(+), 70 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/pr170634.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 743c4f574e131..af85985843914 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -864,6 +864,19 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
   if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
                             WO->getRHS(), *WO, OperationResult, OverflowResult))
     return createOverflowTuple(WO, OperationResult, OverflowResult);
+    
+  // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y} 
+  if (WO->getBinaryOp() == Instruction::Sub && !WO->isSigned()) {
+    IRBuilder<> Builder(WO);
+    Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
+    Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
+    
+    Value *ResultStruct = UndefValue::get(WO->getType());
+    ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
+    ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
+  
+    return replaceInstUsesWith(*WO, ResultStruct);
+  }
 
   // See whether we can optimize the overflow check with assumption information.
   for (User *U : WO->users()) {
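For reference, a minimal standalone C++ check (not part of the patch) of the identity the added code relies on: the replacement pair {X - Y, X u< Y} matches the semantics of llvm.usub.with.overflow, modelled here with the GCC/Clang builtin __builtin_sub_overflow.

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t Vals[] = {0, 1, 2, 41, 42, 127, 128, 254, 255};
  for (uint8_t X : Vals) {
    for (uint8_t Y : Vals) {
      uint8_t WrappedDiff = static_cast<uint8_t>(X - Y); // X - Y modulo 2^8
      bool Underflow = X < Y;                            // X u< Y
      uint8_t Ref;
      bool RefOv = __builtin_sub_overflow(X, Y, &Ref);   // reference semantics
      assert(WrappedDiff == Ref && Underflow == RefOv);
    }
  }
  std::puts("{X - Y, X u< Y} matches usub.with.overflow for all sampled values");
}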
diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index da2123a5dfe74..fc73ce5503ffe 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -1068,12 +1068,12 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
 ; CHECK-LABEL: @extract_value_usub(
 ; CHECK-NEXT:    [[Z:%.*]] = add nuw i8 [[ZZ:%.*]], 1
 ; CHECK-NEXT:    [[Y:%.*]] = add i8 [[X:%.*]], [[Z]]
-; CHECK-NEXT:    [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
-; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
-; CHECK-NEXT:    [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
+; CHECK-NEXT:    [[SUB:%.*]] = xor i8 [[ZZ]], -1
+; CHECK-NEXT:    [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
 ; CHECK-NEXT:    call void @use.i1(i1 [[UOV]])
 ; CHECK-NEXT:    call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT:    ret i1 false
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[ZZ]], -1
+; CHECK-NEXT:    ret i1 [[R]]
 ;
   %z = add nuw i8 %zz, 1
   %y = add i8 %x, %z
@@ -1090,12 +1090,11 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
 define i1 @extract_value_usub_fail(i8 %x, i8 %z) {
 ; CHECK-LABEL: @extract_value_usub_fail(
 ; CHECK-NEXT:    [[Y:%.*]] = add i8 [[X:%.*]], [[Z:%.*]]
-; CHECK-NEXT:    [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
-; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
-; CHECK-NEXT:    [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
+; CHECK-NEXT:    [[SUB:%.*]] = sub i8 0, [[Z]]
+; CHECK-NEXT:    [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
 ; CHECK-NEXT:    call void @use.i1(i1 [[UOV]])
 ; CHECK-NEXT:    call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[SUB]], 0
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[Z]], 0
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %y = add i8 %x, %z
diff --git a/llvm/test/Transforms/InstCombine/pr170634.ll b/llvm/test/Transforms/InstCombine/pr170634.ll
new file mode 100644
index 0000000000000..62a332e14b04a
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/pr170634.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+define dso_local i64 @func(i64 noundef %x, i64 noundef %y) local_unnamed_addr {
+; CHECK-LABEL: @func(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[RETURN:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nuw i64 [[X]], [[Y]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i64 [ 291, [[IF_THEN]] ], [ [[TMP1]], [[IF_END]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL_0]]
+;
+entry:
+  %0 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %x, i64 %y)
+  %1 = extractvalue { i64, i1 } %0, 1
+  %2 = extractvalue { i64, i1 } %0, 0
+  br i1 %1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  br label %return
+
+if.end:                                           ; preds = %entry
+  br label %return
+
+return:                                           ; preds = %if.end, %if.then
+  %retval.0 = phi i64 [ 291, %if.then ], [ %2, %if.end ]
+  ret i64 %retval.0
+}
+
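For readability, a rough C++ equivalent of the reduced test above; this is an assumption reconstructed from the IR, not the reporter's original source.

#include <cstdint>

// Hypothetical source-level shape of pr170634: return 291 when the unsigned
// subtraction would wrap, otherwise return the difference.
uint64_t func(uint64_t x, uint64_t y) {
  uint64_t diff;
  if (__builtin_sub_overflow(x, y, &diff)) // the i1 overflow result
    return 291;
  return diff;                             // the i64 subtraction result
}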
diff --git a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
index 30a5072c7edc8..46b8a853e6cf5 100644
--- a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
@@ -141,16 +141,16 @@ define i1 @t1_strict_logical(i8 %base, i8 %offset) {
 
 define i1 @t2(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t2(
-; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -168,16 +168,16 @@ define i1 @t2(i8 %base, i8 %offset) {
 
 define i1 @t2_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t2_logical(
-; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -321,16 +321,16 @@ define i1 @t5_commutability2_logical(i8 %base, i8 %offset) {
 
 define i1 @t6_commutability(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t6_commutability(
-; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -348,16 +348,16 @@ define i1 @t6_commutability(i8 %base, i8 %offset) {
 
 define i1 @t6_commutability_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t6_commutability_logical(
-; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -459,14 +459,14 @@ define i1 @t7_nonstrict_logical(i8 %base, i8 %offset) {
 
 define i1 @t8(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t8(
-; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -482,14 +482,14 @@ define i1 @t8(i8 %base, i8 %offset) {
 
 define i1 @t8_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t8_logical(
-; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
diff --git a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
index 90ca39a70a0bb..c9030e5ab0321 100644
--- a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
+++ b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
@@ -175,11 +175,10 @@ define i32 @test7(i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[BB1:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -205,11 +204,10 @@ define i32 @test8(i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    br i1 [[COND_NOT]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -296,11 +294,10 @@ define i32 @test10(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[AND:%.*]] = and i1 [[COND]], [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -328,11 +325,10 @@ define i32 @test10_logical(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[AND:%.*]] = select i1 [[COND]], i1 [[COND2:%.*]], i1 false
 ; CHECK-NEXT:    br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -360,11 +356,10 @@ define i32 @test11(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -392,11 +387,10 @@ define i32 @test11_logical(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -424,11 +418,10 @@ define i32 @test12(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -456,11 +449,10 @@ define i32 @test12_logical(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
diff --git a/llvm/test/Transforms/InstCombine/usubo.ll b/llvm/test/Transforms/InstCombine/usubo.ll
index 2074190a2cd45..e4b9c0e08ba22 100644
--- a/llvm/test/Transforms/InstCombine/usubo.ll
+++ b/llvm/test/Transforms/InstCombine/usubo.ll
@@ -130,10 +130,9 @@ define i1 @sub_ne0(i8 %x, i8 %y, i1 %b) {
 
 define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
 ; CHECK-LABEL: @sub_eq1(
-; CHECK-NEXT:    [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
-; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
+; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
 ; CHECK-NEXT:    call void @use(i1 [[OV]])
-; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
 ; CHECK-NEXT:    [[EQ1:%.*]] = icmp eq i8 [[SUB]], 1
 ; CHECK-NEXT:    ret i1 [[EQ1]]
 ;
@@ -149,10 +148,9 @@ define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
 
 define i1 @sub_sgt0(i8 %x, i8 %y, i1 %b) {
 ; CHECK-LABEL: @sub_sgt0(
-; CHECK-NEXT:    [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
-; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
+; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
 ; CHECK-NEXT:    call void @use(i1 [[OV]])
-; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
 ; CHECK-NEXT:    [[SGT0:%.*]] = icmp sgt i8 [[SUB]], 0
 ; CHECK-NEXT:    ret i1 [[SGT0]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/with_overflow.ll b/llvm/test/Transforms/InstCombine/with_overflow.ll
index fa810408730e1..4f7a15cc89d6c 100644
--- a/llvm/test/Transforms/InstCombine/with_overflow.ll
+++ b/llvm/test/Transforms/InstCombine/with_overflow.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+; RUN: opt -passes='instcombine<no-verify-fixpoint>' -S < %s | FileCheck %s
 
 declare { i8, i1 } @llvm.uadd.with.overflow.i8(i8, i8) nounwind readnone
 declare { i8, i1 } @llvm.sadd.with.overflow.i8(i8, i8) nounwind readnone
@@ -506,7 +506,10 @@ define { i32, i1 } @ssub_no_canonicalize_constant_arg0(i32 %x) nounwind {
 
 define { i32, i1 } @usub_no_canonicalize_constant_arg0(i32 %x) nounwind {
 ; CHECK-LABEL: @usub_no_canonicalize_constant_arg0(
-; CHECK-NEXT:    [[A:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 [[X:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 42, [[X:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 42, [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i32, i1 } undef, i32 [[TMP1]], 0
+; CHECK-NEXT:    [[A:%.*]] = insertvalue { i32, i1 } [[TMP3]], i1 [[TMP2]], 1
 ; CHECK-NEXT:    ret { i32, i1 } [[A]]
 ;
   %a = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 %x)

From e3fdf8dd13a1a8c3fc3ea7dd1916762d95276570 Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Fri, 5 Dec 2025 23:46:22 +0530
Subject: [PATCH 2/8] Formatted with git clang-format HEAD~1

---
 llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index af85985843914..3bd7eb855b147 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -864,17 +864,17 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
   if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
                             WO->getRHS(), *WO, OperationResult, OverflowResult))
     return createOverflowTuple(WO, OperationResult, OverflowResult);
-    
-  // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y} 
+
+  // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y}
   if (WO->getBinaryOp() == Instruction::Sub && !WO->isSigned()) {
     IRBuilder<> Builder(WO);
     Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
     Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
-    
+
     Value *ResultStruct = UndefValue::get(WO->getType());
     ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
     ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
-  
+
     return replaceInstUsesWith(*WO, ResultStruct);
   }
 

From aeef41f725b96ec57f72c2eb9788735419ae7172 Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Sat, 6 Dec 2025 00:27:48 +0530
Subject: [PATCH 3/8] fix formatting and replace undef with poison

---
 llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp |  2 +-
 .../result-of-usub-is-non-zero-and-no-overflow.ll    | 12 ++++++------
 llvm/test/Transforms/InstCombine/with_overflow.ll    |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 3bd7eb855b147..d0b71f12c3159 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -871,7 +871,7 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
     Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
     Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
 
-    Value *ResultStruct = UndefValue::get(WO->getType());
+    Value *ResultStruct = PoisonValue::get(WO->getType());
     ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
     ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
 
diff --git a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
index 46b8a853e6cf5..f8b318bc3680a 100644
--- a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
@@ -143,7 +143,7 @@ define i1 @t2(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t2(
 ; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
 ; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
@@ -170,7 +170,7 @@ define i1 @t2_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t2_logical(
 ; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
 ; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
@@ -323,7 +323,7 @@ define i1 @t6_commutability(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t6_commutability(
 ; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
 ; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
@@ -350,7 +350,7 @@ define i1 @t6_commutability_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t6_commutability_logical(
 ; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
 ; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
@@ -461,7 +461,7 @@ define i1 @t8(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t8(
 ; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
 ; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
@@ -484,7 +484,7 @@ define i1 @t8_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t8_logical(
 ; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
 ; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
diff --git a/llvm/test/Transforms/InstCombine/with_overflow.ll b/llvm/test/Transforms/InstCombine/with_overflow.ll
index 4f7a15cc89d6c..0c82bdc256ddf 100644
--- a/llvm/test/Transforms/InstCombine/with_overflow.ll
+++ b/llvm/test/Transforms/InstCombine/with_overflow.ll
@@ -508,7 +508,7 @@ define { i32, i1 } @usub_no_canonicalize_constant_arg0(i32 %x) nounwind {
 ; CHECK-LABEL: @usub_no_canonicalize_constant_arg0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 42, [[X:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 42, [[X]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i32, i1 } undef, i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i32, i1 } poison, i32 [[TMP1]], 0
 ; CHECK-NEXT:    [[A:%.*]] = insertvalue { i32, i1 } [[TMP3]], i1 [[TMP2]], 1
 ; CHECK-NEXT:    ret { i32, i1 } [[A]]
 ;

From 87d56d3d369db1fef1789ccbc3f7890e30daa96a Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Sat, 6 Dec 2025 23:16:05 +0530
Subject: [PATCH 4/8] Address review feedback

---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |    4 +-
 .../InstCombine/InstCombineCalls.cpp          |   13 -
 .../test/CodeGen/RISCV/arith-with-overflow.ll |    7 +-
 .../CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll |   24 +-
 llvm/test/CodeGen/RISCV/rvv/abs-vp.ll         |    4 +-
 llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll  |    8 +-
 llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll       |    8 +-
 llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll        |   44 +-
 llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll        |   16 +-
 llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll       |   46 +-
 llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll        |   68 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll |   16 +-
 .../RISCV/rvv/fixed-vectors-bitreverse-vp.ll  |   16 +-
 .../RISCV/rvv/fixed-vectors-bswap-vp.ll       |   16 +-
 .../RISCV/rvv/fixed-vectors-ceil-vp.ll        |   64 +-
 .../RISCV/rvv/fixed-vectors-ctlz-vp.ll        |  432 ++-
 .../RISCV/rvv/fixed-vectors-ctpop-vp.ll       |   76 +-
 .../RISCV/rvv/fixed-vectors-cttz-vp.ll        |  244 +-
 .../RISCV/rvv/fixed-vectors-floor-vp.ll       |   64 +-
 .../RISCV/rvv/fixed-vectors-fmaximum-vp.ll    |   77 +-
 .../RISCV/rvv/fixed-vectors-fminimum-vp.ll    |   77 +-
 .../RISCV/rvv/fixed-vectors-fpext-vp.ll       |    8 +-
 .../RISCV/rvv/fixed-vectors-fptosi-vp.ll      |   16 +-
 .../RISCV/rvv/fixed-vectors-fptoui-vp.ll      |   16 +-
 .../RISCV/rvv/fixed-vectors-fptrunc-vp.ll     |    8 +-
 .../RISCV/rvv/fixed-vectors-nearbyint-vp.ll   |   32 +-
 .../rvv/fixed-vectors-reduction-fp-vp.ll      |   16 +-
 .../rvv/fixed-vectors-reduction-int-vp.ll     |    8 +-
 .../rvv/fixed-vectors-reduction-mask-vp.ll    |    8 +-
 .../RISCV/rvv/fixed-vectors-rint-vp.ll        |   32 +-
 .../RISCV/rvv/fixed-vectors-round-vp.ll       |   64 +-
 .../RISCV/rvv/fixed-vectors-roundeven-vp.ll   |   64 +-
 .../RISCV/rvv/fixed-vectors-roundtozero-vp.ll |   64 +-
 .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll    |   16 +-
 .../RISCV/rvv/fixed-vectors-setcc-int-vp.ll   |   50 +-
 .../RISCV/rvv/fixed-vectors-sext-vp.ll        |   16 +-
 .../RISCV/rvv/fixed-vectors-sitofp-vp.ll      |   16 +-
 .../RISCV/rvv/fixed-vectors-strided-vpload.ll |   74 +-
 .../rvv/fixed-vectors-strided-vpstore.ll      |   18 +-
 .../RISCV/rvv/fixed-vectors-trunc-vp.ll       |  299 +-
 .../RISCV/rvv/fixed-vectors-uitofp-vp.ll      |   16 +-
 .../RISCV/rvv/fixed-vectors-vadd-vp.ll        |   32 +-
 .../RISCV/rvv/fixed-vectors-vcopysign-vp.ll   |   16 +-
 .../RISCV/rvv/fixed-vectors-vfabs-vp.ll       |   16 +-
 .../RISCV/rvv/fixed-vectors-vfma-vp.ll        |   48 +-
 .../RISCV/rvv/fixed-vectors-vfmax-vp.ll       |   16 +-
 .../RISCV/rvv/fixed-vectors-vfmin-vp.ll       |   16 +-
 .../RISCV/rvv/fixed-vectors-vfmuladd-vp.ll    |   48 +-
 .../RISCV/rvv/fixed-vectors-vfneg-vp.ll       |   16 +-
 .../RISCV/rvv/fixed-vectors-vfsqrt-vp.ll      |   16 +-
 .../RISCV/rvv/fixed-vectors-vmax-vp.ll        |   24 +-
 .../RISCV/rvv/fixed-vectors-vmaxu-vp.ll       |   24 +-
 .../RISCV/rvv/fixed-vectors-vmin-vp.ll        |   24 +-
 .../RISCV/rvv/fixed-vectors-vminu-vp.ll       |   24 +-
 .../RISCV/rvv/fixed-vectors-vpgather.ll       |  184 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vpload.ll |   24 +-
 .../RISCV/rvv/fixed-vectors-vpmerge.ll        |   16 +-
 .../RISCV/rvv/fixed-vectors-vpscatter.ll      |   64 +-
 .../RISCV/rvv/fixed-vectors-vpstore.ll        |    8 +-
 .../RISCV/rvv/fixed-vectors-vsadd-vp.ll       |   32 +-
 .../RISCV/rvv/fixed-vectors-vsaddu-vp.ll      |   32 +-
 .../RISCV/rvv/fixed-vectors-vselect-vp.ll     |   28 +-
 .../RISCV/rvv/fixed-vectors-vssub-vp.ll       |   32 +-
 .../RISCV/rvv/fixed-vectors-vssubu-vp.ll      |   32 +-
 .../RISCV/rvv/fixed-vectors-zext-vp.ll        |   16 +-
 llvm/test/CodeGen/RISCV/rvv/floor-vp.ll       |   52 +-
 llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll    |   68 +-
 llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll    |   68 +-
 llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll   |    4 +-
 llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll      |    2 +-
 llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll       |    2 +-
 llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll   |   52 +-
 .../RISCV/rvv/nontemporal-vp-scalable.ll      | 3010 ++++++++---------
 llvm/test/CodeGen/RISCV/rvv/rint-vp.ll        |   52 +-
 llvm/test/CodeGen/RISCV/rvv/round-vp.ll       |   52 +-
 llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll   |   52 +-
 llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll |   52 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll    |  869 +++--
 llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll   |  297 +-
 llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll |   50 +-
 .../test/CodeGen/RISCV/rvv/strided-vpstore.ll |   82 +-
 llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll        |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll       |    4 +-
 llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll       |  531 +--
 llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll       |  276 +-
 llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll        | 2956 ++++------------
 llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll       |   52 +-
 llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll       |   52 +-
 llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll       |  139 +-
 llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll    |    6 +-
 llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll       |    4 +-
 llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll      |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll     |    6 +-
 llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll     |    6 +-
 llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll    |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll      |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll       |  276 +-
 llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll        |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll       |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll        |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll       |   18 +-
 llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll |   16 +-
 llvm/test/CodeGen/RISCV/rvv/vp-splat.ll       |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vp-splice.ll      |  140 +-
 .../test/CodeGen/RISCV/rvv/vpgather-sdnode.ll |   28 +-
 llvm/test/CodeGen/RISCV/rvv/vpload.ll         |   34 +-
 llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll |   18 +-
 .../CodeGen/RISCV/rvv/vpscatter-sdnode.ll     |   16 +-
 llvm/test/CodeGen/RISCV/rvv/vpstore.ll        |   20 +-
 .../CodeGen/RISCV/rvv/vreductions-fp-vp.ll    |    4 +-
 .../CodeGen/RISCV/rvv/vreductions-int-vp.ll   |    2 +-
 .../CodeGen/RISCV/rvv/vreductions-mask-vp.ll  |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll       |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll      |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll     |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll       |    4 +-
 llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll     |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll       |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll      |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll      |   12 +-
 llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll     |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll       |    4 +-
 llvm/test/CodeGen/RISCV/usub_sat.ll           |   48 +-
 llvm/test/CodeGen/RISCV/usub_sat_plus.ll      |   44 +-
 llvm/test/CodeGen/RISCV/xaluo.ll              |  129 +-
 llvm/test/CodeGen/RISCV/xqcia.ll              |    6 +-
 .../test/Transforms/InstCombine/known-bits.ll |   15 +-
 llvm/test/Transforms/InstCombine/pr170634.ll  |    5 +-
 ...ult-of-usub-is-non-zero-and-no-overflow.ll |   60 +-
 .../usub-overflow-known-by-implied-cond.ll    |   40 +-
 llvm/test/Transforms/InstCombine/usubo.ll     |   10 +-
 .../Transforms/InstCombine/with_overflow.ll   |    5 +-
 132 files changed, 5135 insertions(+), 7744 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 172c7485e108b..8b46c4c1e66db 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11466,7 +11466,9 @@ void TargetLowering::expandUADDSUBO(
                      DAG.getConstant(0, dl, Node->getValueType(0)), ISD::SETNE);
   } else {
     ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT;
-    SetCC = DAG.getSetCC(dl, SetCCType, Result, LHS, CC);
+    SDValue CompareLHS = IsAdd ? Result : LHS;
+    SDValue CompareRHS = IsAdd ? LHS : RHS;
+    SetCC = DAG.getSetCC(dl, SetCCType, CompareLHS, CompareRHS, CC);
   }
   Overflow = DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType);
 }
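As a reference point for the comparison this hunk selects (and independent of the DAG code itself), the textbook scalar conditions for unsigned overflow are: a wrapped add overflows iff the result is u< either operand, and a wrapped sub borrows iff LHS u< RHS. A minimal standalone sketch that checks both against the compiler builtins:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Vals[] = {0u, 1u, 2u, 0x7fffffffu, 0x80000000u,
                           0xfffffffeu, 0xffffffffu};
  for (uint32_t L : Vals) {
    for (uint32_t R : Vals) {
      uint32_t Sum = L + R;  // wraps modulo 2^32
      uint32_t Tmp;
      bool AddOv = Sum < L;  // unsigned add overflow: wrapped sum u< an operand
      bool SubOv = L < R;    // unsigned sub overflow: LHS u< RHS (borrow)
      assert(AddOv == __builtin_add_overflow(L, R, &Tmp));
      assert(SubOv == __builtin_sub_overflow(L, R, &Tmp));
    }
  }
}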
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d0b71f12c3159..743c4f574e131 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -865,19 +865,6 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
                             WO->getRHS(), *WO, OperationResult, OverflowResult))
     return createOverflowTuple(WO, OperationResult, OverflowResult);
 
-  // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y}
-  if (WO->getBinaryOp() == Instruction::Sub && !WO->isSigned()) {
-    IRBuilder<> Builder(WO);
-    Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
-    Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
-
-    Value *ResultStruct = PoisonValue::get(WO->getType());
-    ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
-    ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
-
-    return replaceInstUsesWith(*WO, ResultStruct);
-  }
-
   // See whether we can optimize the overflow check with assumption information.
   for (User *U : WO->users()) {
     if (!match(U, m_ExtractValue<1>(m_Value())))
diff --git a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
index 557b4b7c2afa2..84526a1fca0f9 100644
--- a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
+++ b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
@@ -54,9 +54,10 @@ entry:
 define i1 @usub(i32 %a, i32 %b, ptr %c) nounwind {
 ; RV32I-LABEL: usub:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
-; RV32I-NEXT:    sw a1, 0(a2)
+; RV32I-NEXT:    sltu a3, a1, a0
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    sw a0, 0(a2)
+; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    ret
 entry:
   %x = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
index ea9786d0b10b3..f5f122a8c9dd7 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
@@ -715,7 +715,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32I-NEXT:    zext.b a0, a3
 ; RV32I-NEXT:    sub a1, a0, s1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sltu a0, s1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    and a2, a0, a1
 ; RV32I-NEXT:    sb a3, 3(sp)
@@ -755,7 +755,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; RV32IA-NEXT:    srl a4, a4, a0
 ; RV32IA-NEXT:    zext.b a4, a4
 ; RV32IA-NEXT:    sub a6, a4, a1
-; RV32IA-NEXT:    sltu a4, a4, a6
+; RV32IA-NEXT:    sltu a4, a1, a4
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    and a4, a4, a6
 ; RV32IA-NEXT:    sll a4, a4, a0
@@ -792,7 +792,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64I-NEXT:    zext.b a0, a3
 ; RV64I-NEXT:    sub a1, a0, s1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sltu a0, s1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    and a2, a0, a1
 ; RV64I-NEXT:    sb a3, 7(sp)
@@ -832,7 +832,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; RV64IA-NEXT:    sext.w a6, a3
 ; RV64IA-NEXT:    zext.b a5, a5
 ; RV64IA-NEXT:    sub a7, a5, a1
-; RV64IA-NEXT:    sltu a5, a5, a7
+; RV64IA-NEXT:    sltu a5, a1, a5
 ; RV64IA-NEXT:    addi a5, a5, -1
 ; RV64IA-NEXT:    and a5, a5, a7
 ; RV64IA-NEXT:    sllw a5, a5, a0
@@ -877,7 +877,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32I-NEXT:    and a0, a3, s1
 ; RV32I-NEXT:    sub a1, a0, s2
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sltu a0, s2, a0
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    and a2, a0, a1
 ; RV32I-NEXT:    sh a3, 14(sp)
@@ -920,7 +920,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV32IA-NEXT:    srl a5, a5, a0
 ; RV32IA-NEXT:    and a5, a5, a3
 ; RV32IA-NEXT:    sub a7, a5, a1
-; RV32IA-NEXT:    sltu a5, a5, a7
+; RV32IA-NEXT:    sltu a5, a1, a5
 ; RV32IA-NEXT:    addi a5, a5, -1
 ; RV32IA-NEXT:    and a5, a5, a7
 ; RV32IA-NEXT:    sll a5, a5, a0
@@ -961,7 +961,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64I-NEXT:    and a0, a3, s1
 ; RV64I-NEXT:    sub a1, a0, s2
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sltu a0, s2, a0
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    and a2, a0, a1
 ; RV64I-NEXT:    sh a3, 14(sp)
@@ -1004,7 +1004,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV64IA-NEXT:    sext.w a7, a4
 ; RV64IA-NEXT:    and a6, a6, a3
 ; RV64IA-NEXT:    sub t0, a6, a1
-; RV64IA-NEXT:    sltu a6, a6, t0
+; RV64IA-NEXT:    sltu a6, a1, a6
 ; RV64IA-NEXT:    addi a6, a6, -1
 ; RV64IA-NEXT:    and a6, a6, t0
 ; RV64IA-NEXT:    sllw a6, a6, a0
@@ -1044,7 +1044,7 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
 ; RV32I-NEXT:  .LBB6_1: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32I-NEXT:    sub a0, a3, s1
-; RV32I-NEXT:    sltu a1, a3, a0
+; RV32I-NEXT:    sltu a1, s1, a3
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    and a2, a1, a0
 ; RV32I-NEXT:    sw a3, 0(sp)
@@ -1075,7 +1075,7 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
 ; RV32IA-NEXT:    # Child Loop BB6_3 Depth 2
 ; RV32IA-NEXT:    mv a3, a2
 ; RV32IA-NEXT:    sub a2, a2, a1
-; RV32IA-NEXT:    sltu a4, a3, a2
+; RV32IA-NEXT:    sltu a4, a1, a3
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    and a4, a4, a2
 ; RV32IA-NEXT:  .LBB6_3: # %atomicrmw.start
@@ -1298,7 +1298,7 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; RV64I-NEXT:  .LBB7_1: # %atomicrmw.start
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64I-NEXT:    sub a0, a3, s1
-; RV64I-NEXT:    sltu a1, a3, a0
+; RV64I-NEXT:    sltu a1, s1, a3
 ; RV64I-NEXT:    addi a1, a1, -1
 ; RV64I-NEXT:    and a2, a1, a0
 ; RV64I-NEXT:    sd a3, 0(sp)
@@ -1329,7 +1329,7 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; RV64IA-NEXT:    # Child Loop BB7_3 Depth 2
 ; RV64IA-NEXT:    mv a3, a2
 ; RV64IA-NEXT:    sub a2, a2, a1
-; RV64IA-NEXT:    sltu a4, a3, a2
+; RV64IA-NEXT:    sltu a4, a1, a3
 ; RV64IA-NEXT:    addi a4, a4, -1
 ; RV64IA-NEXT:    and a4, a4, a2
 ; RV64IA-NEXT:  .LBB7_3: # %atomicrmw.start
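The sub / sltu / addi -1 / and sequences being updated in this file (and in similar length-splitting sequences throughout the rvv tests below) are the usual branchless lowering of an unsigned saturating subtract. A reference-only C++ sketch of that shape, with a hypothetical helper name:

#include <cassert>
#include <cstdint>

// Clamp-to-zero subtraction: mirrors sub / sltu / addi -1 / and.
uint32_t usub_sat(uint32_t X, uint32_t Y) {
  uint32_t Diff = X - Y;             // sub  (wraps when X u< Y)
  uint32_t Borrow = X < Y ? 1u : 0u; // sltu (1 iff it wrapped)
  uint32_t Mask = Borrow - 1u;       // addi -1: 0 on wrap, all-ones otherwise
  return Diff & Mask;                // and : result clamped to 0 on wrap
}

int main() {
  assert(usub_sat(5, 3) == 2);
  assert(usub_sat(3, 5) == 0); // would wrap; saturates to 0
  assert(usub_sat(7, 7) == 0);
}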
diff --git a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
index 5b215c5173211..0fb4b2a06b76f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
@@ -519,7 +519,7 @@ define <vscale x 16 x i64> @vp_abs_nxv16i64(<vscale x 16 x i64> %va, <vscale x 1
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -543,7 +543,7 @@ define <vscale x 16 x i64> @vp_abs_nxv16i64_unmasked(<vscale x 16 x i64> %va, i3
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 09b8fdbf11d26..025f944bcd51c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -3018,7 +3018,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
 ; CHECK-NEXT:    slli a3, a3, 2
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a4
 ; CHECK-NEXT:    sub a4, a0, a3
-; CHECK-NEXT:    sltu a5, a0, a4
+; CHECK-NEXT:    sltu a5, a3, a0
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    and a5, a5, a4
 ; CHECK-NEXT:    lui a6, 5
@@ -3079,7 +3079,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
 ; CHECK-ZVBB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVBB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVBB-NEXT:    and a2, a3, a2
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
@@ -3104,7 +3104,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16_unmasked(<vscale x 64 x i16>
 ; CHECK-NEXT:    lui a2, 3
 ; CHECK-NEXT:    slli a3, a3, 2
 ; CHECK-NEXT:    sub a4, a0, a3
-; CHECK-NEXT:    sltu a5, a0, a4
+; CHECK-NEXT:    sltu a5, a3, a0
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    and a5, a5, a4
 ; CHECK-NEXT:    lui a6, 5
@@ -3160,7 +3160,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16_unmasked(<vscale x 64 x i16>
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVBB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVBB-NEXT:    and a2, a3, a2
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index 0177b8cfd4393..668a770610f20 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -1534,7 +1534,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
 ; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
@@ -1561,7 +1561,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
 ; CHECK-ZVKB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVKB-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-ZVKB-NEXT:    sub a2, a0, a1
-; CHECK-ZVKB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVKB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVKB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVKB-NEXT:    and a2, a3, a2
 ; CHECK-ZVKB-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
@@ -1584,7 +1584,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16_unmasked(<vscale x 64 x i16> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
@@ -1606,7 +1606,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16_unmasked(<vscale x 64 x i16> %va,
 ; CHECK-ZVKB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVKB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVKB-NEXT:    sub a2, a0, a1
-; CHECK-ZVKB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVKB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVKB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVKB-NEXT:    and a2, a3, a2
 ; CHECK-ZVKB-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
index 6c7709f52e30b..d3813b703c5be 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v5, v6
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16_unmasked(<vscale x 32 x bflo
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v6, v7
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v5, v6
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v6, v7
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -1585,7 +1585,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
 ; RV32ZFH-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZFH-NEXT:    sub a2, a0, a1
 ; RV32ZFH-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZFH-NEXT:    sltu a3, a0, a2
+; RV32ZFH-NEXT:    sltu a3, a1, a0
 ; RV32ZFH-NEXT:    addi a3, a3, -1
 ; RV32ZFH-NEXT:    and a2, a3, a2
 ; RV32ZFH-NEXT:    vmv1r.v v0, v6
@@ -1631,7 +1631,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
 ; RV64ZFH-NEXT:    sub a3, a0, a1
 ; RV64ZFH-NEXT:    slli a2, a2, 52
 ; RV64ZFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZFH-NEXT:    sltu a2, a0, a3
+; RV64ZFH-NEXT:    sltu a2, a1, a0
 ; RV64ZFH-NEXT:    addi a2, a2, -1
 ; RV64ZFH-NEXT:    and a2, a2, a3
 ; RV64ZFH-NEXT:    vmv1r.v v0, v6
@@ -1676,7 +1676,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64_unmasked(<vscale x 16 x doubl
 ; RV32ZFH-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZFH-NEXT:    sub a3, a0, a1
 ; RV32ZFH-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZFH-NEXT:    sltu a2, a0, a3
+; RV32ZFH-NEXT:    sltu a2, a1, a0
 ; RV32ZFH-NEXT:    addi a2, a2, -1
 ; RV32ZFH-NEXT:    and a2, a2, a3
 ; RV32ZFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -1710,7 +1710,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64_unmasked(<vscale x 16 x doubl
 ; RV64ZFH-NEXT:    sub a3, a0, a1
 ; RV64ZFH-NEXT:    slli a2, a2, 52
 ; RV64ZFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZFH-NEXT:    sltu a2, a0, a3
+; RV64ZFH-NEXT:    sltu a2, a1, a0
 ; RV64ZFH-NEXT:    addi a2, a2, -1
 ; RV64ZFH-NEXT:    and a2, a2, a3
 ; RV64ZFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 20f397b694180..f8293f6c671f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -1195,7 +1195,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-NEXT:    srli a3, a1, 3
 ; CHECK-NEXT:    sub a5, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    sltu a3, a0, a5
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a5, a3, a5
 ; CHECK-NEXT:    li a3, 1086
@@ -1228,7 +1228,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    sub a3, a0, a1
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    sltu a2, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a2, a2, -1
 ; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -1252,7 +1252,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; CHECK-NEXT:    fsrmi a4, 1
 ; CHECK-NEXT:    li a2, 52
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a5, a0, a3
+; CHECK-NEXT:    sltu a5, a1, a0
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    and a5, a5, a3
 ; CHECK-NEXT:    li a3, 1086
@@ -1280,7 +1280,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVBB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVBB-NEXT:    and a2, a3, a2
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2402,7 +2402,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a4, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a4
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a4, a2, a4
 ; CHECK-NEXT:    li a2, 52
@@ -2433,7 +2433,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    sub a3, a0, a1
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    sltu a2, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a2, a2, -1
 ; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2456,7 +2456,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a2
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a4, a4, a2
 ; CHECK-NEXT:    li a2, 52
@@ -2482,7 +2482,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVBB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVBB-NEXT:    and a2, a3, a2
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index 1bbefc65d3e39..d16418f57033a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -1990,7 +1990,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    addi a2, a2, 1365
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    sltu a2, a0, a3
+; RV32-NEXT:    sltu a2, a1, a0
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2117,10 +2117,15 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    sub a6, a0, a1
+; RV64-NEXT:    sltu a1, a1, a0
+; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
 ; RV64-NEXT:    addi a4, a4, -241
 ; RV64-NEXT:    addi a5, a5, 257
+; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a1, a1, a6
 ; RV64-NEXT:    slli a6, a2, 32
 ; RV64-NEXT:    add a2, a2, a6
 ; RV64-NEXT:    slli a6, a3, 32
@@ -2129,11 +2134,6 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    add a4, a4, a6
 ; RV64-NEXT:    slli a6, a5, 32
 ; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a1
 ; RV64-NEXT:    vand.vx v24, v24, a2, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
@@ -2144,9 +2144,9 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV64-NEXT:    vand.vx v24, v24, a2, v0.t
 ; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
@@ -2158,7 +2158,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
 ; RV64-NEXT:    vmul.vx v16, v16, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v16, a6, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64:
@@ -2169,7 +2169,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    sub a3, a0, a1
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    sltu a2, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a2, a2, -1
 ; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2200,10 +2200,10 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV32-NEXT:    lui a3, 209715
 ; RV32-NEXT:    sub a4, a0, a1
 ; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v0, a2
-; RV32-NEXT:    sltu a2, a0, a4
+; RV32-NEXT:    sltu a2, a1, a0
+; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a2, a2, a4
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2308,10 +2308,15 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV64-NEXT:    lui a4, 209715
 ; RV64-NEXT:    lui a5, 61681
 ; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    sub a7, a0, a2
+; RV64-NEXT:    sltu a0, a2, a0
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    addi a3, a3, 1365
 ; RV64-NEXT:    addi a4, a4, 819
 ; RV64-NEXT:    addi a5, a5, -241
 ; RV64-NEXT:    addi a6, a6, 257
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a7
 ; RV64-NEXT:    slli a7, a3, 32
 ; RV64-NEXT:    add a3, a3, a7
 ; RV64-NEXT:    slli a7, a4, 32
@@ -2320,11 +2325,6 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV64-NEXT:    add a5, a5, a7
 ; RV64-NEXT:    slli a7, a6, 32
 ; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    li a7, 56
-; RV64-NEXT:    sub a2, a0, a2
-; RV64-NEXT:    sltu a0, a0, a2
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a2
 ; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -2346,26 +2346,26 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV64-NEXT:    vadd.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v8, v8, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v16, v24
+; RV64-NEXT:    vand.vx v16, v16, a5
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vmul.vx v8, v8, a6
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v16, a5
+; RV64-NEXT:    vmul.vx v16, v16, a6
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v8, v8, a7
+; RV64-NEXT:    vsrl.vx v8, v8, a2
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v16, v16, a6
-; RV64-NEXT:    vsrl.vx v16, v16, a7
+; RV64-NEXT:    vsrl.vx v16, v16, a2
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64_unmasked:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVBB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVBB-NEXT:    and a2, a3, a2
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
index c82ad17545a6a..464c4d1f5f899 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
@@ -2154,7 +2154,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    srli a2, a1, 3
 ; RV32-NEXT:    sub a3, a0, a1
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    sltu a2, a0, a3
+; RV32-NEXT:    sltu a2, a1, a0
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    lui a3, 349525
@@ -2190,31 +2190,31 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    lui a3, 61681
 ; RV32-NEXT:    addi a3, a3, -241
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    lui a3, 4112
 ; RV32-NEXT:    addi a3, a3, 257
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    vmv.v.x v16, a3
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v8, v16, a2, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    bltu a0, a1, .LBB46_2
@@ -2226,11 +2226,11 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vi v8, v16, -1, v0.t
+; RV32-NEXT:    vnot.v v16, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
@@ -2286,11 +2286,14 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    srli a6, a1, 3
 ; RV64-NEXT:    sub a7, a0, a1
+; RV64-NEXT:    vslidedown.vx v0, v0, a6
+; RV64-NEXT:    sltu a6, a1, a0
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
 ; RV64-NEXT:    addi a4, a4, -241
 ; RV64-NEXT:    addi t0, a5, 257
-; RV64-NEXT:    vslidedown.vx v0, v0, a6
+; RV64-NEXT:    addi a6, a6, -1
+; RV64-NEXT:    and a7, a6, a7
 ; RV64-NEXT:    slli a6, a2, 32
 ; RV64-NEXT:    add a6, a2, a6
 ; RV64-NEXT:    slli a5, a3, 32
@@ -2299,9 +2302,6 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    add a2, a4, a2
 ; RV64-NEXT:    slli a3, t0, 32
 ; RV64-NEXT:    add a3, t0, a3
-; RV64-NEXT:    sltu a4, a0, a7
-; RV64-NEXT:    addi a4, a4, -1
-; RV64-NEXT:    and a7, a4, a7
 ; RV64-NEXT:    li a4, 56
 ; RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vi v24, v16, -1, v0.t
@@ -2350,7 +2350,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    sub a3, a0, a1
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    sltu a2, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a2, a2, -1
 ; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2381,10 +2381,10 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT:    lui a3, 209715
 ; RV32-NEXT:    sub a4, a0, a1
 ; RV32-NEXT:    addi a2, a2, 1365
-; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v0, a2
-; RV32-NEXT:    sltu a2, a0, a4
+; RV32-NEXT:    sltu a2, a1, a0
+; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a2, a2, a4
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2489,21 +2489,21 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    sub a6, a0, a1
+; RV64-NEXT:    sltu a7, a1, a0
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
-; RV64-NEXT:    addi a7, a4, -241
-; RV64-NEXT:    addi t0, a5, 257
+; RV64-NEXT:    addi t0, a4, -241
+; RV64-NEXT:    addi t1, a5, 257
+; RV64-NEXT:    addi a7, a7, -1
+; RV64-NEXT:    and a6, a7, a6
 ; RV64-NEXT:    slli a5, a2, 32
 ; RV64-NEXT:    add a5, a2, a5
 ; RV64-NEXT:    slli a4, a3, 32
 ; RV64-NEXT:    add a4, a3, a4
-; RV64-NEXT:    slli a2, a7, 32
-; RV64-NEXT:    add a2, a7, a2
-; RV64-NEXT:    slli a3, t0, 32
-; RV64-NEXT:    add a3, t0, a3
-; RV64-NEXT:    sltu a7, a0, a6
-; RV64-NEXT:    addi a7, a7, -1
-; RV64-NEXT:    and a6, a7, a6
+; RV64-NEXT:    slli a2, t0, 32
+; RV64-NEXT:    add a2, t0, a2
+; RV64-NEXT:    slli a3, t1, 32
+; RV64-NEXT:    add a3, t1, a3
 ; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vi v24, v16, -1
 ; RV64-NEXT:    vnot.v v16, v16
@@ -2547,7 +2547,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVBB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVBB-NEXT:    and a2, a3, a2
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -3731,7 +3731,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a4, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a4
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a4, a2, a4
 ; CHECK-NEXT:    li a2, 52
@@ -3766,7 +3766,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
 ; CHECK-ZVBB-NEXT:    sub a3, a0, a1
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    sltu a2, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a2, a2, -1
 ; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -3789,7 +3789,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a2
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a4, a4, a2
 ; CHECK-NEXT:    li a2, 52
@@ -3819,7 +3819,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
+; CHECK-ZVBB-NEXT:    sltu a3, a1, a0
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
 ; CHECK-ZVBB-NEXT:    and a2, a3, a2
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
index fa81e1f6f3514..912a63b09f1a7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
@@ -392,10 +392,10 @@ define <32 x i64> @vp_abs_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v24, v8, 0, v0.t
 ; CHECK-NEXT:    vmax.vv v8, v8, v24, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v24, v16, 0, v0.t
@@ -417,10 +417,10 @@ define <32 x i64> @vp_abs_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v24, v8, 0
 ; CHECK-NEXT:    vmax.vv v8, v8, v24
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v24, v16, 0
 ; CHECK-NEXT:    vmax.vv v16, v16, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index f436bbb9a66ca..8e322b64ef551 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -2386,10 +2386,10 @@ define <128 x i16> @vp_bitreverse_v128i16(<128 x i16> %va, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; CHECK-NEXT:    lui a1, 1
 ; CHECK-NEXT:    lui a2, 3
-; CHECK-NEXT:    addi a3, a0, -64
-; CHECK-NEXT:    sltu a0, a0, a3
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a3, a0, a3
+; CHECK-NEXT:    sltiu a3, a0, 65
+; CHECK-NEXT:    addi a0, a0, -64
+; CHECK-NEXT:    neg a3, a3
+; CHECK-NEXT:    and a3, a3, a0
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
 ; CHECK-NEXT:    addi a1, a1, -241
@@ -2450,10 +2450,10 @@ define <128 x i16> @vp_bitreverse_v128i16_unmasked(<128 x i16> %va, i32 zeroext
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
 ; CHECK-NEXT:    lui a2, 1
 ; CHECK-NEXT:    lui a3, 3
-; CHECK-NEXT:    addi a4, a0, -64
-; CHECK-NEXT:    sltu a0, a0, a4
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a4
+; CHECK-NEXT:    sltiu a4, a0, 65
+; CHECK-NEXT:    addi a0, a0, -64
+; CHECK-NEXT:    neg a4, a4
+; CHECK-NEXT:    and a0, a4, a0
 ; CHECK-NEXT:    lui a4, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v24
 ; CHECK-NEXT:    addi a2, a2, -241
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
index eca94ccb9bf7f..c1c9e581decf8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
@@ -1275,10 +1275,10 @@ define <128 x i16> @vp_bswap_v128i16(<128 x i16> %va, <128 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v24, v0.t
-; CHECK-NEXT:    addi a1, a0, -64
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 65
+; CHECK-NEXT:    addi a0, a0, -64
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v16, 8, v0.t
@@ -1302,10 +1302,10 @@ define <128 x i16> @vp_bswap_v128i16_unmasked(<128 x i16> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vsrl.vi v24, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v24
-; CHECK-NEXT:    addi a1, a0, -64
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 65
+; CHECK-NEXT:    addi a0, a0, -64
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v16, 8
 ; CHECK-NEXT:    vsll.vi v16, v16, 8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
index 466d5d4b8e80a..b58de7abf0442 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFH-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT:    addi a1, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a1
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a1
+; RV32ZVFH-NEXT:    sltiu a1, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a1, a1
+; RV32ZVFH-NEXT:    and a0, a1, a0
 ; RV32ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFH-NEXT:    fsrmi a1, 3
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; RV64ZVFH-NEXT:    li a1, 1075
 ; RV64ZVFH-NEXT:    slli a1, a1, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFH-NEXT:    addi a1, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a1
-; RV64ZVFH-NEXT:    addi a0, a0, -1
+; RV64ZVFH-NEXT:    sltiu a1, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a1, a1
+; RV64ZVFH-NEXT:    and a0, a1, a0
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT:    and a0, a0, a1
 ; RV64ZVFH-NEXT:    fsrmi a1, 3
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFHMIN-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a1
+; RV32ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a1, a1
+; RV32ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFHMIN-NEXT:    fsrmi a1, 3
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; RV64ZVFHMIN-NEXT:    li a1, 1075
 ; RV64ZVFHMIN-NEXT:    slli a1, a1, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
+; RV64ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a1, a1
+; RV64ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT:    and a0, a0, a1
 ; RV64ZVFHMIN-NEXT:    fsrmi a1, 3
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT:    addi a2, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a2
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a2
+; RV32ZVFH-NEXT:    sltiu a2, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a2, a2
+; RV32ZVFH-NEXT:    and a0, a2, a0
 ; RV32ZVFH-NEXT:    fsrmi a2, 3
 ; RV32ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; RV64ZVFH-NEXT:    li a2, 1075
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    addi a2, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a2
-; RV64ZVFH-NEXT:    addi a0, a0, -1
-; RV64ZVFH-NEXT:    and a0, a0, a2
+; RV64ZVFH-NEXT:    sltiu a2, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a2, a2
+; RV64ZVFH-NEXT:    and a0, a2, a0
 ; RV64ZVFH-NEXT:    fsrmi a2, 3
 ; RV64ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a2
+; RV32ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a2, a2
+; RV32ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV32ZVFHMIN-NEXT:    fsrmi a2, 3
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; RV64ZVFHMIN-NEXT:    li a2, 1075
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV64ZVFHMIN-NEXT:    and a0, a0, a2
+; RV64ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a2, a2
+; RV64ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV64ZVFHMIN-NEXT:    fsrmi a2, 3
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index 00c36cb7f7327..d1fadc962c2eb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -1979,10 +1979,10 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    sltiu a3, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a0, a3, a0
 ; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
@@ -2065,22 +2065,22 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    sltiu a6, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
-; RV64-NEXT:    addi a6, a4, -241
-; RV64-NEXT:    addi a7, a5, 257
+; RV64-NEXT:    addi a7, a4, -241
+; RV64-NEXT:    addi t0, a5, 257
+; RV64-NEXT:    neg a4, a6
+; RV64-NEXT:    and a6, a4, a0
 ; RV64-NEXT:    slli a5, a2, 32
 ; RV64-NEXT:    add a5, a2, a5
 ; RV64-NEXT:    slli a4, a3, 32
 ; RV64-NEXT:    add a4, a3, a4
-; RV64-NEXT:    slli a2, a6, 32
-; RV64-NEXT:    add a2, a6, a2
-; RV64-NEXT:    slli a3, a7, 32
-; RV64-NEXT:    add a3, a7, a3
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a6, a0, a6
+; RV64-NEXT:    slli a2, a7, 32
+; RV64-NEXT:    add a2, a7, a2
+; RV64-NEXT:    slli a3, t0, 32
+; RV64-NEXT:    add a3, t0, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v8, 2, v0.t
@@ -2150,9 +2150,9 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    li a2, 32
@@ -2160,110 +2160,102 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a3, a3, 1365
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    sltiu a3, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a0, a3, a0
 ; RV32-NEXT:    lui a3, 209715
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    vsrl.vi v0, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 1
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vsrl.vi v0, v8, 4
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 2
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vsrl.vi v0, v8, 8
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 4
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v0, v8, a2
+; RV32-NEXT:    vsrl.vi v0, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 8
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vsrl.vx v0, v8, a2
+; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 16
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vx v0, v16, a2
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vsub.vv v0, v8, v0
+; RV32-NEXT:    vsub.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
-; RV32-NEXT:    vor.vv v24, v16, v8
+; RV32-NEXT:    vnot.v v0, v16
+; RV32-NEXT:    vsrl.vi v16, v0, 1
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    vmv.v.x v16, a3
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v0, 2
-; RV32-NEXT:    vand.vv v0, v0, v8
-; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v24, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v0, v0, v16
-; RV32-NEXT:    vsub.vv v24, v24, v0
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v24, v0, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v24, v8
+; RV32-NEXT:    vand.vv v0, v24, v16
 ; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v8, v24, v8
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    lui a2, 61681
 ; RV32-NEXT:    lui a3, 4112
 ; RV32-NEXT:    addi a2, a2, -241
 ; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v24
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v24, v8, v24
+; RV32-NEXT:    vmul.vv v16, v16, v24
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v24, a2
+; RV32-NEXT:    vsrl.vx v16, v16, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2285,95 +2277,95 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    lui a4, 209715
 ; RV64-NEXT:    lui a5, 61681
 ; RV64-NEXT:    lui a6, 4112
-; RV64-NEXT:    addi a7, a3, 1365
-; RV64-NEXT:    addi a3, a4, 819
-; RV64-NEXT:    addi a4, a5, -241
-; RV64-NEXT:    addi a6, a6, 257
-; RV64-NEXT:    slli a5, a7, 32
-; RV64-NEXT:    add a7, a7, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a5, a3, a5
-; RV64-NEXT:    slli a3, a4, 32
-; RV64-NEXT:    add a3, a4, a3
-; RV64-NEXT:    slli a4, a6, 32
-; RV64-NEXT:    add a4, a6, a4
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a6, a0, a6
-; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    sltiu a7, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
+; RV64-NEXT:    addi a3, a3, 1365
+; RV64-NEXT:    addi a4, a4, 819
+; RV64-NEXT:    addi a5, a5, -241
+; RV64-NEXT:    addi t0, a6, 257
+; RV64-NEXT:    neg a6, a7
+; RV64-NEXT:    and a0, a6, a0
+; RV64-NEXT:    slli a6, a3, 32
+; RV64-NEXT:    add a7, a3, a6
+; RV64-NEXT:    slli a6, a4, 32
+; RV64-NEXT:    add a6, a4, a6
+; RV64-NEXT:    slli a3, a5, 32
+; RV64-NEXT:    add a3, a5, a3
+; RV64-NEXT:    slli a4, t0, 32
+; RV64-NEXT:    add a4, t0, a4
+; RV64-NEXT:    li a5, 56
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 8
+; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 2
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 16
+; RV64-NEXT:    vsrl.vi v24, v8, 8
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v24, v8, a2
+; RV64-NEXT:    vsrl.vi v24, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 8
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v24, v8, a2
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vnot.v v8, v8
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsrl.vx v24, v16, a2
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
 ; RV64-NEXT:    vand.vx v24, v24, a7
 ; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v24, v16, a2
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v8, a5
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vnot.v v16, v16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v8, v8, a5
+; RV64-NEXT:    vand.vx v24, v8, a6
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a6
 ; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vand.vx v24, v24, a7
 ; RV64-NEXT:    vsub.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v16, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a6
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
+; RV64-NEXT:    vand.vx v16, v16, a6
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v16, a5
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v24, v16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vand.vx v16, v16, a3
 ; RV64-NEXT:    vmul.vx v16, v16, a4
-; RV64-NEXT:    vsrl.vx v16, v16, a0
+; RV64-NEXT:    vsrl.vx v16, v16, a5
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 false, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
@@ -4354,10 +4346,10 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    sltiu a3, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a0, a3, a0
 ; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
@@ -4440,22 +4432,22 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    sltiu a6, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
-; RV64-NEXT:    addi a6, a4, -241
-; RV64-NEXT:    addi a7, a5, 257
+; RV64-NEXT:    addi a7, a4, -241
+; RV64-NEXT:    addi t0, a5, 257
+; RV64-NEXT:    neg a4, a6
+; RV64-NEXT:    and a6, a4, a0
 ; RV64-NEXT:    slli a5, a2, 32
 ; RV64-NEXT:    add a5, a2, a5
 ; RV64-NEXT:    slli a4, a3, 32
 ; RV64-NEXT:    add a4, a3, a4
-; RV64-NEXT:    slli a2, a6, 32
-; RV64-NEXT:    add a2, a6, a2
-; RV64-NEXT:    slli a3, a7, 32
-; RV64-NEXT:    add a3, a7, a3
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a6, a0, a6
+; RV64-NEXT:    slli a2, a7, 32
+; RV64-NEXT:    add a2, a7, a2
+; RV64-NEXT:    slli a3, t0, 32
+; RV64-NEXT:    add a3, t0, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v8, 2, v0.t
@@ -4525,9 +4517,9 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    li a2, 32
@@ -4535,110 +4527,102 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    addi a3, a3, 1365
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    sltiu a3, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a0, a3, a0
 ; RV32-NEXT:    lui a3, 209715
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    vsrl.vi v0, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 1
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vsrl.vi v0, v8, 4
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 2
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vsrl.vi v0, v8, 8
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 4
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v0, v8, a2
+; RV32-NEXT:    vsrl.vi v0, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 8
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vsrl.vx v0, v8, a2
+; RV32-NEXT:    vor.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v16, 16
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v8, v8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vx v0, v16, a2
 ; RV32-NEXT:    vor.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    vand.vv v0, v0, v24
-; RV32-NEXT:    vsub.vv v0, v8, v0
+; RV32-NEXT:    vsub.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
-; RV32-NEXT:    vor.vv v24, v16, v8
+; RV32-NEXT:    vnot.v v0, v16
+; RV32-NEXT:    vsrl.vi v16, v0, 1
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
+; RV32-NEXT:    vmv.v.x v16, a3
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v0, 2
-; RV32-NEXT:    vand.vv v0, v0, v8
-; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v24, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v0, v0, v16
-; RV32-NEXT:    vsub.vv v24, v24, v0
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v24, v0, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v16, 4
-; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v24, v8
+; RV32-NEXT:    vand.vv v0, v24, v16
 ; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v8, v24, v8
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    lui a2, 61681
 ; RV32-NEXT:    lui a3, 4112
 ; RV32-NEXT:    addi a2, a2, -241
 ; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v24, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v24
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v24, v8, v24
+; RV32-NEXT:    vmul.vv v16, v16, v24
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v16, a2
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v24, a2
+; RV32-NEXT:    vsrl.vx v16, v16, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -4660,95 +4644,95 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV64-NEXT:    lui a4, 209715
 ; RV64-NEXT:    lui a5, 61681
 ; RV64-NEXT:    lui a6, 4112
-; RV64-NEXT:    addi a7, a3, 1365
-; RV64-NEXT:    addi a3, a4, 819
-; RV64-NEXT:    addi a4, a5, -241
-; RV64-NEXT:    addi a6, a6, 257
-; RV64-NEXT:    slli a5, a7, 32
-; RV64-NEXT:    add a7, a7, a5
-; RV64-NEXT:    slli a5, a3, 32
-; RV64-NEXT:    add a5, a3, a5
-; RV64-NEXT:    slli a3, a4, 32
-; RV64-NEXT:    add a3, a4, a3
-; RV64-NEXT:    slli a4, a6, 32
-; RV64-NEXT:    add a4, a6, a4
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a6, a0, a6
-; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    sltiu a7, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
+; RV64-NEXT:    addi a3, a3, 1365
+; RV64-NEXT:    addi a4, a4, 819
+; RV64-NEXT:    addi a5, a5, -241
+; RV64-NEXT:    addi t0, a6, 257
+; RV64-NEXT:    neg a6, a7
+; RV64-NEXT:    and a0, a6, a0
+; RV64-NEXT:    slli a6, a3, 32
+; RV64-NEXT:    add a7, a3, a6
+; RV64-NEXT:    slli a6, a4, 32
+; RV64-NEXT:    add a6, a4, a6
+; RV64-NEXT:    slli a3, a5, 32
+; RV64-NEXT:    add a3, a5, a3
+; RV64-NEXT:    slli a4, t0, 32
+; RV64-NEXT:    add a4, t0, a4
+; RV64-NEXT:    li a5, 56
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 8
+; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 2
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v8, 16
+; RV64-NEXT:    vsrl.vi v24, v8, 8
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v24, v8, a2
+; RV64-NEXT:    vsrl.vi v24, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 8
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v24, v8, a2
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vnot.v v8, v8
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsrl.vx v24, v16, a2
 ; RV64-NEXT:    vor.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
 ; RV64-NEXT:    vand.vx v24, v24, a7
 ; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v24, v16, a2
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v8, a5
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vnot.v v16, v16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v8, v8, a5
+; RV64-NEXT:    vand.vx v24, v8, a6
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a6
 ; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vand.vx v24, v24, a7
 ; RV64-NEXT:    vsub.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v16, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a6
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
+; RV64-NEXT:    vand.vx v16, v16, a6
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v16, a5
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v24, v16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vand.vx v16, v16, a3
 ; RV64-NEXT:    vmul.vx v16, v16, a4
-; RV64-NEXT:    vsrl.vx v16, v16, a0
+; RV64-NEXT:    vsrl.vx v16, v16, a5
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 true, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index f56438bf87e6a..61bc86333d95f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -1430,7 +1430,10 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi a2, a2, 819
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a2
-; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
@@ -1455,24 +1458,21 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi a2, a2, 257
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    addi a2, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 40
@@ -1481,15 +1481,18 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
@@ -1504,15 +1507,12 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -1541,10 +1541,14 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    lui a2, 209715
 ; RV64-NEXT:    lui a3, 61681
 ; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    sltiu a5, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a1, a1, 1365
 ; RV64-NEXT:    addi a2, a2, 819
 ; RV64-NEXT:    addi a3, a3, -241
 ; RV64-NEXT:    addi a4, a4, 257
+; RV64-NEXT:    neg a5, a5
+; RV64-NEXT:    and a0, a5, a0
 ; RV64-NEXT:    slli a5, a1, 32
 ; RV64-NEXT:    add a1, a1, a5
 ; RV64-NEXT:    slli a5, a2, 32
@@ -1553,10 +1557,6 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    add a3, a3, a5
 ; RV64-NEXT:    slli a5, a4, 32
 ; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    addi a5, a0, -16
-; RV64-NEXT:    sltu a0, a0, a5
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a5
 ; RV64-NEXT:    li a5, 56
 ; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
@@ -1603,10 +1603,10 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a2, a2, 1365
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v0, a2
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a2, a2, 819
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1628,13 +1628,13 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vand.vv v24, v16, v0
 ; RV32-NEXT:    vsrl.vi v16, v16, 2
 ; RV32-NEXT:    vand.vv v16, v16, v0
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
 ; RV32-NEXT:    lui a2, 61681
 ; RV32-NEXT:    addi a2, a2, -241
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v24, v16, 4
 ; RV32-NEXT:    vadd.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
@@ -1672,10 +1672,14 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    sltiu a6, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
 ; RV64-NEXT:    addi a4, a4, -241
 ; RV64-NEXT:    addi a5, a5, 257
+; RV64-NEXT:    neg a6, a6
+; RV64-NEXT:    and a0, a6, a0
 ; RV64-NEXT:    slli a6, a2, 32
 ; RV64-NEXT:    add a2, a2, a6
 ; RV64-NEXT:    slli a6, a3, 32
@@ -1684,10 +1688,6 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    add a4, a4, a6
 ; RV64-NEXT:    slli a6, a5, 32
 ; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a6
 ; RV64-NEXT:    li a6, 56
 ; RV64-NEXT:    vand.vx v24, v24, a2
 ; RV64-NEXT:    vsub.vv v8, v8, v24
@@ -1710,18 +1710,18 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vadd.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v8, v8, a4
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v16, v24
+; RV64-NEXT:    vand.vx v16, v16, a4
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vmul.vx v8, v8, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v16, a4
+; RV64-NEXT:    vmul.vx v16, v16, a5
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vx v8, v8, a6
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vmul.vx v16, v16, a5
 ; RV64-NEXT:    vsrl.vx v16, v16, a6
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64> %va, <32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index 098384d200045..0e3eadcce484e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -1604,10 +1604,10 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 40
@@ -1616,26 +1616,26 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v24, v8, -1, v0.t
+; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
@@ -1679,31 +1679,31 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    lui a2, 209715
 ; RV64-NEXT:    lui a3, 61681
 ; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    sltiu a5, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a1, a1, 1365
 ; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a5, a3, -241
+; RV64-NEXT:    addi a3, a3, -241
 ; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a3, a1, 32
-; RV64-NEXT:    add a6, a1, a3
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a3, a2, a3
-; RV64-NEXT:    slli a1, a5, 32
-; RV64-NEXT:    add a1, a5, a1
+; RV64-NEXT:    neg a5, a5
+; RV64-NEXT:    and a5, a5, a0
+; RV64-NEXT:    slli a0, a1, 32
+; RV64-NEXT:    add a6, a1, a0
+; RV64-NEXT:    slli a0, a2, 32
+; RV64-NEXT:    add a7, a2, a0
+; RV64-NEXT:    slli a1, a3, 32
+; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    slli a2, a4, 32
 ; RV64-NEXT:    add a2, a4, a2
-; RV64-NEXT:    addi a4, a0, -16
-; RV64-NEXT:    sltu a0, a0, a4
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a4, a0, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a7, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a7, v0.t
 ; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
@@ -1711,16 +1711,16 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vmul.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vi v24, v16, -1, v0.t
 ; RV64-NEXT:    vnot.v v16, v16, v0.t
 ; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
 ; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v16, a3, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a7, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a7, v0.t
 ; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
 ; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
@@ -1744,9 +1744,9 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vi v24, v8, -1
 ; RV32-NEXT:    vnot.v v0, v8
@@ -1754,15 +1754,10 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a2, a2, 1365
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v0, v24
@@ -1774,8 +1769,10 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vadd.vi v0, v16, -1
 ; RV32-NEXT:    vnot.v v16, v16
 ; RV32-NEXT:    vand.vv v0, v16, v0
+; RV32-NEXT:    vsrl.vi v16, v0, 1
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v0, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1783,16 +1780,9 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vsrl.vi v24, v24, 2
 ; RV32-NEXT:    vand.vv v24, v24, v16
 ; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v0
 ; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vv v24, v0, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v8, 4
@@ -1826,7 +1816,7 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v16, v16, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1848,10 +1838,14 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    sltiu a6, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
 ; RV64-NEXT:    addi a4, a4, -241
 ; RV64-NEXT:    addi a5, a5, 257
+; RV64-NEXT:    neg a6, a6
+; RV64-NEXT:    and a0, a6, a0
 ; RV64-NEXT:    slli a6, a2, 32
 ; RV64-NEXT:    add a2, a2, a6
 ; RV64-NEXT:    slli a6, a3, 32
@@ -1860,47 +1854,43 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    add a4, a4, a6
 ; RV64-NEXT:    slli a6, a5, 32
 ; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a6
 ; RV64-NEXT:    li a6, 56
 ; RV64-NEXT:    vand.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vi v24, v16, -1
 ; RV64-NEXT:    vnot.v v16, v16
 ; RV64-NEXT:    vand.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vand.vx v24, v24, a2
 ; RV64-NEXT:    vsub.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a3
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vadd.vv v8, v24, v8
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v24, v16, a3
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
 ; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vadd.vv v16, v24, v16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v8, v8, a4
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vmul.vx v8, v8, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vx v8, v8, a6
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vand.vx v16, v16, a4
 ; RV64-NEXT:    vmul.vx v16, v16, a5
 ; RV64-NEXT:    vsrl.vx v16, v16, a6
@@ -3509,10 +3499,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 40
@@ -3521,26 +3511,26 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vi v24, v8, -1, v0.t
+; RV32-NEXT:    vadd.vi v16, v8, -1, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
@@ -3584,31 +3574,31 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    lui a2, 209715
 ; RV64-NEXT:    lui a3, 61681
 ; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    sltiu a5, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a1, a1, 1365
 ; RV64-NEXT:    addi a2, a2, 819
-; RV64-NEXT:    addi a5, a3, -241
+; RV64-NEXT:    addi a3, a3, -241
 ; RV64-NEXT:    addi a4, a4, 257
-; RV64-NEXT:    slli a3, a1, 32
-; RV64-NEXT:    add a6, a1, a3
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a3, a2, a3
-; RV64-NEXT:    slli a1, a5, 32
-; RV64-NEXT:    add a1, a5, a1
+; RV64-NEXT:    neg a5, a5
+; RV64-NEXT:    and a5, a5, a0
+; RV64-NEXT:    slli a0, a1, 32
+; RV64-NEXT:    add a6, a1, a0
+; RV64-NEXT:    slli a0, a2, 32
+; RV64-NEXT:    add a7, a2, a0
+; RV64-NEXT:    slli a1, a3, 32
+; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    slli a2, a4, 32
 ; RV64-NEXT:    add a2, a4, a2
-; RV64-NEXT:    addi a4, a0, -16
-; RV64-NEXT:    sltu a0, a0, a4
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a4, a0, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a7, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a7, v0.t
 ; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
@@ -3616,16 +3606,16 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    vmul.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    vmv1r.v v0, v7
-; RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vi v24, v16, -1, v0.t
 ; RV64-NEXT:    vnot.v v16, v16, v0.t
 ; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
 ; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vand.vx v24, v16, a3, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a7, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a7, v0.t
 ; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
 ; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
@@ -3649,9 +3639,9 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vi v24, v8, -1
 ; RV32-NEXT:    vnot.v v0, v8
@@ -3659,15 +3649,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    addi a2, a2, 1365
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v24, v0, v24
@@ -3679,8 +3664,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vadd.vi v0, v16, -1
 ; RV32-NEXT:    vnot.v v16, v16
 ; RV32-NEXT:    vand.vv v0, v16, v0
+; RV32-NEXT:    vsrl.vi v16, v0, 1
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v0, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a2
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -3688,16 +3675,9 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vsrl.vi v24, v24, 2
 ; RV32-NEXT:    vand.vv v24, v24, v16
 ; RV32-NEXT:    vadd.vv v8, v8, v24
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v0
 ; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vv v24, v0, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v0, v8, 4
@@ -3731,7 +3711,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v16, v16, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3753,10 +3733,14 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    lui a4, 61681
 ; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    sltiu a6, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
 ; RV64-NEXT:    addi a2, a2, 1365
 ; RV64-NEXT:    addi a3, a3, 819
 ; RV64-NEXT:    addi a4, a4, -241
 ; RV64-NEXT:    addi a5, a5, 257
+; RV64-NEXT:    neg a6, a6
+; RV64-NEXT:    and a0, a6, a0
 ; RV64-NEXT:    slli a6, a2, 32
 ; RV64-NEXT:    add a2, a2, a6
 ; RV64-NEXT:    slli a6, a3, 32
@@ -3765,47 +3749,43 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV64-NEXT:    add a4, a4, a6
 ; RV64-NEXT:    slli a6, a5, 32
 ; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a6
 ; RV64-NEXT:    li a6, 56
 ; RV64-NEXT:    vand.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vi v24, v16, -1
 ; RV64-NEXT:    vnot.v v16, v16
 ; RV64-NEXT:    vand.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vand.vx v24, v24, a2
 ; RV64-NEXT:    vsub.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a3
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vadd.vv v8, v24, v8
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v24, v16, a3
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
 ; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vadd.vv v16, v24, v16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v8, v8, a4
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vmul.vx v8, v8, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vx v8, v8, a6
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v16, v16, v24
 ; RV64-NEXT:    vand.vx v16, v16, a4
 ; RV64-NEXT:    vmul.vx v16, v16, a5
 ; RV64-NEXT:    vsrl.vx v16, v16, a6
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
index 76f5f0a32bd1c..5a0749068b41d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFH-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT:    addi a1, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a1
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a1
+; RV32ZVFH-NEXT:    sltiu a1, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a1, a1
+; RV32ZVFH-NEXT:    and a0, a1, a0
 ; RV32ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFH-NEXT:    fsrmi a1, 2
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV64ZVFH-NEXT:    li a1, 1075
 ; RV64ZVFH-NEXT:    slli a1, a1, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFH-NEXT:    addi a1, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a1
-; RV64ZVFH-NEXT:    addi a0, a0, -1
+; RV64ZVFH-NEXT:    sltiu a1, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a1, a1
+; RV64ZVFH-NEXT:    and a0, a1, a0
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT:    and a0, a0, a1
 ; RV64ZVFH-NEXT:    fsrmi a1, 2
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFHMIN-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a1
+; RV32ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a1, a1
+; RV32ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFHMIN-NEXT:    fsrmi a1, 2
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV64ZVFHMIN-NEXT:    li a1, 1075
 ; RV64ZVFHMIN-NEXT:    slli a1, a1, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
+; RV64ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a1, a1
+; RV64ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT:    and a0, a0, a1
 ; RV64ZVFHMIN-NEXT:    fsrmi a1, 2
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT:    addi a2, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a2
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a2
+; RV32ZVFH-NEXT:    sltiu a2, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a2, a2
+; RV32ZVFH-NEXT:    and a0, a2, a0
 ; RV32ZVFH-NEXT:    fsrmi a2, 2
 ; RV32ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV64ZVFH-NEXT:    li a2, 1075
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    addi a2, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a2
-; RV64ZVFH-NEXT:    addi a0, a0, -1
-; RV64ZVFH-NEXT:    and a0, a0, a2
+; RV64ZVFH-NEXT:    sltiu a2, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a2, a2
+; RV64ZVFH-NEXT:    and a0, a2, a0
 ; RV64ZVFH-NEXT:    fsrmi a2, 2
 ; RV64ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a2
+; RV32ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a2, a2
+; RV32ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV32ZVFHMIN-NEXT:    fsrmi a2, 2
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV64ZVFHMIN-NEXT:    li a2, 1075
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV64ZVFHMIN-NEXT:    and a0, a0, a2
+; RV64ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a2, a2
+; RV64ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV64ZVFHMIN-NEXT:    fsrmi a2, 2
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
index da6e2fae93687..ad7ee735707f7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
@@ -611,10 +611,10 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a2, 24
@@ -657,75 +657,6 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 }
 
 define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %vb, i32 zeroext %evl) {
-; CHECK-LABEL: vfmax_vv_v32f64_unmasked:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a2
-; CHECK-NEXT:    bltu a2, a1, .LBB25_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:  .LBB25_2:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v8, v8
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
-; CHECK-NEXT:    vfmax.vv v16, v16, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.maximum.v32f64(<32 x double> %va, <32 x double> %vb, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x double> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
index e179970199171..9a5304e0d94e6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
@@ -611,10 +611,10 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a2, 24
@@ -657,75 +657,6 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 }
 
 define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %vb, i32 zeroext %evl) {
-; CHECK-LABEL: vfmin_vv_v32f64_unmasked:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    mv a0, a2
-; CHECK-NEXT:    bltu a2, a1, .LBB25_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:  .LBB25_2:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v8, v8
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
-; CHECK-NEXT:    vfmin.vv v16, v16, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.minimum.v32f64(<32 x double> %va, <32 x double> %vb, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x double> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
index 465b166826a37..6d87ecfd3bc6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
@@ -96,10 +96,10 @@ define <32 x double> @vfpext_v32f32_v32f64(<32 x float> %a, <32 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB7_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 16
 ; CHECK-NEXT:    vmv1r.v v0, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
index 96eda109e1c70..044b9fefa1220 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
@@ -376,10 +376,10 @@ define <32 x i64> @vfptosi_v32i64_v32f64(<32 x double> %va, <32 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB25_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v16, v16, v0.t
@@ -399,10 +399,10 @@ define <32 x i64> @vfptosi_v32i64_v32f64_unmasked(<32 x double> %va, i32 zeroext
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v16, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
index 4020100bf364b..55f4d9e0805c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
@@ -376,10 +376,10 @@ define <32 x i64> @vfptoui_v32i64_v32f64(<32 x double> %va, <32 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB25_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.xu.f.v v8, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.xu.f.v v16, v16, v0.t
@@ -399,10 +399,10 @@ define <32 x i64> @vfptoui_v32i64_v32f64_unmasked(<32 x double> %va, i32 zeroext
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.xu.f.v v8, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.xu.f.v v16, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
index e509722b623a2..aab5bbdfebacd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
@@ -97,10 +97,10 @@ define <32 x float> @vfptrunc_v32f32_v32f64(<32 x double> %a, <32 x i1> %m, i32
 ; CHECK-NEXT:  .LBB7_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vfncvt.f.f.w v8, v24, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfncvt.f.f.w v24, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
index 57c94830fc606..e3ed908a5bddb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -741,10 +741,10 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32-NEXT:    addi a1, a0, -16
-; RV32-NEXT:    sltu a0, a0, a1
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    sltiu a1, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a1, a1
+; RV32-NEXT:    and a0, a1, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32-NEXT:    frflags a1
@@ -787,12 +787,12 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    li a1, 1075
 ; RV64-NEXT:    slli a1, a1, 52
 ; RV64-NEXT:    fmv.d.x fa5, a1
-; RV64-NEXT:    addi a1, a0, -16
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    sltiu a1, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
+; RV64-NEXT:    neg a1, a1
+; RV64-NEXT:    and a0, a1, a0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64-NEXT:    and a0, a0, a1
 ; RV64-NEXT:    frflags a1
 ; RV64-NEXT:    vmv1r.v v0, v6
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -832,10 +832,10 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; RV32-NEXT:    vfabs.v v24, v8
 ; RV32-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    frflags a2
 ; RV32-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -870,10 +870,10 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; RV64-NEXT:    li a2, 1075
 ; RV64-NEXT:    slli a2, a2, 52
 ; RV64-NEXT:    fmv.d.x fa5, a2
-; RV64-NEXT:    addi a2, a0, -16
-; RV64-NEXT:    sltu a0, a0, a2
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a2
+; RV64-NEXT:    sltiu a2, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
+; RV64-NEXT:    neg a2, a2
+; RV64-NEXT:    and a0, a2, a0
 ; RV64-NEXT:    frflags a2
 ; RV64-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
index ca9b24e60e503..4e90727b6ebf1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
@@ -123,10 +123,10 @@ define float @vpreduce_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32
 ; CHECK-NEXT:    vfmv.s.f v25, fa0
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vfredusum.vs v25, v8, v25, v0.t
-; CHECK-NEXT:    addi a1, a0, -32
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 33
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -32
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfredusum.vs v25, v16, v25, v0.t
@@ -151,10 +151,10 @@ define float @vpreduce_ord_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m,
 ; CHECK-NEXT:    vfmv.s.f v25, fa0
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vfredosum.vs v25, v8, v25, v0.t
-; CHECK-NEXT:    addi a1, a0, -32
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 33
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -32
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfredosum.vs v25, v16, v25, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
index 3e77020ed0213..27211f153b526 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
@@ -654,12 +654,12 @@ define signext i32 @vpreduce_xor_v64i32(i32 signext %s, <64 x i32> %v, <64 x i1>
 ; CHECK-NEXT:  .LBB49_2:
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v25, a0
-; CHECK-NEXT:    addi a0, a1, -32
+; CHECK-NEXT:    sltiu a0, a1, 33
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vredxor.vs v25, v8, v25, v0.t
-; CHECK-NEXT:    sltu a1, a1, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a1, a1, -32
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vredxor.vs v25, v16, v25, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
index 8523ca957a8f5..b5cd2e783ff66 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
@@ -211,15 +211,15 @@ define zeroext i1 @vpreduce_and_v256i1(i1 zeroext %s, <256 x i1> %v, <256 x i1>
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:  .LBB14_2:
 ; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    sltiu a3, a1, 129
+; CHECK-NEXT:    addi a1, a1, -128
+; CHECK-NEXT:    neg a3, a3
+; CHECK-NEXT:    and a1, a3, a1
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vmnot.m v9, v9
 ; CHECK-NEXT:    vcpop.m a2, v9, v0.t
 ; CHECK-NEXT:    seqz a2, a2
 ; CHECK-NEXT:    and a0, a2, a0
-; CHECK-NEXT:    addi a2, a1, -128
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmv1r.v v0, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
index 7540495c0d3b5..41e8d1f982e32 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
@@ -669,12 +669,12 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; RV32-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32-NEXT:    addi a1, a0, -16
-; RV32-NEXT:    sltu a0, a0, a1
-; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    sltiu a1, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a1, a1
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    and a0, a1, a0
 ; RV32-NEXT:    vmv1r.v v0, v6
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -711,12 +711,12 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; RV64-NEXT:    li a1, 1075
 ; RV64-NEXT:    slli a1, a1, 52
 ; RV64-NEXT:    fmv.d.x fa5, a1
-; RV64-NEXT:    addi a1, a0, -16
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    sltiu a1, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
+; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    and a0, a1, a0
 ; RV64-NEXT:    vmv1r.v v0, v6
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vfcvt.x.f.v v24, v8, v0.t
@@ -752,10 +752,10 @@ define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; RV32-NEXT:    vfabs.v v24, v8
 ; RV32-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32-NEXT:    addi a2, a0, -16
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    sltiu a2, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vfabs.v v24, v16
@@ -786,11 +786,11 @@ define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; RV64-NEXT:    li a2, 1075
 ; RV64-NEXT:    slli a2, a2, 52
 ; RV64-NEXT:    fmv.d.x fa5, a2
-; RV64-NEXT:    addi a2, a0, -16
-; RV64-NEXT:    sltu a0, a0, a2
-; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    sltiu a2, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
+; RV64-NEXT:    neg a2, a2
+; RV64-NEXT:    and a0, a2, a0
 ; RV64-NEXT:    vmflt.vf v0, v24, fa5
-; RV64-NEXT:    and a0, a0, a2
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vfabs.v v24, v16
 ; RV64-NEXT:    vmflt.vf v7, v24, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
index de5427f329496..2d4941744292e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFH-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT:    addi a1, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a1
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a1
+; RV32ZVFH-NEXT:    sltiu a1, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a1, a1
+; RV32ZVFH-NEXT:    and a0, a1, a0
 ; RV32ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFH-NEXT:    fsrmi a1, 4
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV64ZVFH-NEXT:    li a1, 1075
 ; RV64ZVFH-NEXT:    slli a1, a1, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFH-NEXT:    addi a1, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a1
-; RV64ZVFH-NEXT:    addi a0, a0, -1
+; RV64ZVFH-NEXT:    sltiu a1, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a1, a1
+; RV64ZVFH-NEXT:    and a0, a1, a0
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT:    and a0, a0, a1
 ; RV64ZVFH-NEXT:    fsrmi a1, 4
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFHMIN-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a1
+; RV32ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a1, a1
+; RV32ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFHMIN-NEXT:    fsrmi a1, 4
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; RV64ZVFHMIN-NEXT:    li a1, 1075
 ; RV64ZVFHMIN-NEXT:    slli a1, a1, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
+; RV64ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a1, a1
+; RV64ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT:    and a0, a0, a1
 ; RV64ZVFHMIN-NEXT:    fsrmi a1, 4
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT:    addi a2, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a2
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a2
+; RV32ZVFH-NEXT:    sltiu a2, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a2, a2
+; RV32ZVFH-NEXT:    and a0, a2, a0
 ; RV32ZVFH-NEXT:    fsrmi a2, 4
 ; RV32ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV64ZVFH-NEXT:    li a2, 1075
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    addi a2, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a2
-; RV64ZVFH-NEXT:    addi a0, a0, -1
-; RV64ZVFH-NEXT:    and a0, a0, a2
+; RV64ZVFH-NEXT:    sltiu a2, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a2, a2
+; RV64ZVFH-NEXT:    and a0, a2, a0
 ; RV64ZVFH-NEXT:    fsrmi a2, 4
 ; RV64ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a2
+; RV32ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a2, a2
+; RV32ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV32ZVFHMIN-NEXT:    fsrmi a2, 4
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; RV64ZVFHMIN-NEXT:    li a2, 1075
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV64ZVFHMIN-NEXT:    and a0, a0, a2
+; RV64ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a2, a2
+; RV64ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV64ZVFHMIN-NEXT:    fsrmi a2, 4
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
index 1c923e3f12171..45ea933f427ae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFH-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT:    addi a1, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a1
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a1
+; RV32ZVFH-NEXT:    sltiu a1, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a1, a1
+; RV32ZVFH-NEXT:    and a0, a1, a0
 ; RV32ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFH-NEXT:    fsrmi a1, 0
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; RV64ZVFH-NEXT:    li a1, 1075
 ; RV64ZVFH-NEXT:    slli a1, a1, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFH-NEXT:    addi a1, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a1
-; RV64ZVFH-NEXT:    addi a0, a0, -1
+; RV64ZVFH-NEXT:    sltiu a1, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a1, a1
+; RV64ZVFH-NEXT:    and a0, a1, a0
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT:    and a0, a0, a1
 ; RV64ZVFH-NEXT:    fsrmi a1, 0
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFHMIN-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a1
+; RV32ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a1, a1
+; RV32ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFHMIN-NEXT:    fsrmi a1, 0
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; RV64ZVFHMIN-NEXT:    li a1, 1075
 ; RV64ZVFHMIN-NEXT:    slli a1, a1, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
+; RV64ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a1, a1
+; RV64ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT:    and a0, a0, a1
 ; RV64ZVFHMIN-NEXT:    fsrmi a1, 0
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT:    addi a2, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a2
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a2
+; RV32ZVFH-NEXT:    sltiu a2, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a2, a2
+; RV32ZVFH-NEXT:    and a0, a2, a0
 ; RV32ZVFH-NEXT:    fsrmi a2, 0
 ; RV32ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; RV64ZVFH-NEXT:    li a2, 1075
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    addi a2, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a2
-; RV64ZVFH-NEXT:    addi a0, a0, -1
-; RV64ZVFH-NEXT:    and a0, a0, a2
+; RV64ZVFH-NEXT:    sltiu a2, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a2, a2
+; RV64ZVFH-NEXT:    and a0, a2, a0
 ; RV64ZVFH-NEXT:    fsrmi a2, 0
 ; RV64ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a2
+; RV32ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a2, a2
+; RV32ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV32ZVFHMIN-NEXT:    fsrmi a2, 0
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; RV64ZVFHMIN-NEXT:    li a2, 1075
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV64ZVFHMIN-NEXT:    and a0, a0, a2
+; RV64ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a2, a2
+; RV64ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV64ZVFHMIN-NEXT:    fsrmi a2, 0
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
index 83cbd2b760341..3dc45f97e6964 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFH-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT:    addi a1, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a1
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a1
+; RV32ZVFH-NEXT:    sltiu a1, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a1, a1
+; RV32ZVFH-NEXT:    and a0, a1, a0
 ; RV32ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFH-NEXT:    fsrmi a1, 1
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
 ; RV64ZVFH-NEXT:    li a1, 1075
 ; RV64ZVFH-NEXT:    slli a1, a1, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFH-NEXT:    addi a1, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a1
-; RV64ZVFH-NEXT:    addi a0, a0, -1
+; RV64ZVFH-NEXT:    sltiu a1, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a1, a1
+; RV64ZVFH-NEXT:    and a0, a1, a0
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFH-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT:    and a0, a0, a1
 ; RV64ZVFH-NEXT:    fsrmi a1, 1
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFH-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8, v0.t
 ; RV32ZVFHMIN-NEXT:    lui a1, %hi(.LCPI26_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a1
+; RV32ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a1, a1
+; RV32ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; RV32ZVFHMIN-NEXT:    fsrmi a1, 1
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
 ; RV64ZVFHMIN-NEXT:    li a1, 1075
 ; RV64ZVFHMIN-NEXT:    slli a1, a1, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT:    addi a1, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
+; RV64ZVFHMIN-NEXT:    sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a1, a1
+; RV64ZVFHMIN-NEXT:    and a0, a1, a0
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT:    and a0, a0, a1
 ; RV64ZVFHMIN-NEXT:    fsrmi a1, 1
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
 ; RV32ZVFH-NEXT:    vfabs.v v24, v8
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT:    addi a2, a0, -16
-; RV32ZVFH-NEXT:    sltu a0, a0, a2
-; RV32ZVFH-NEXT:    addi a0, a0, -1
-; RV32ZVFH-NEXT:    and a0, a0, a2
+; RV32ZVFH-NEXT:    sltiu a2, a0, 17
+; RV32ZVFH-NEXT:    addi a0, a0, -16
+; RV32ZVFH-NEXT:    neg a2, a2
+; RV32ZVFH-NEXT:    and a0, a2, a0
 ; RV32ZVFH-NEXT:    fsrmi a2, 1
 ; RV32ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
 ; RV64ZVFH-NEXT:    li a2, 1075
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    addi a2, a0, -16
-; RV64ZVFH-NEXT:    sltu a0, a0, a2
-; RV64ZVFH-NEXT:    addi a0, a0, -1
-; RV64ZVFH-NEXT:    and a0, a0, a2
+; RV64ZVFH-NEXT:    sltiu a2, a0, 17
+; RV64ZVFH-NEXT:    addi a0, a0, -16
+; RV64ZVFH-NEXT:    neg a2, a2
+; RV64ZVFH-NEXT:    and a0, a2, a0
 ; RV64ZVFH-NEXT:    fsrmi a2, 1
 ; RV64ZVFH-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFH-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
 ; RV32ZVFHMIN-NEXT:    vfabs.v v24, v8
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI27_0)
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV32ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV32ZVFHMIN-NEXT:    and a0, a0, a2
+; RV32ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV32ZVFHMIN-NEXT:    neg a2, a2
+; RV32ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV32ZVFHMIN-NEXT:    fsrmi a2, 1
 ; RV32ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
 ; RV64ZVFHMIN-NEXT:    li a2, 1075
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    addi a2, a0, -16
-; RV64ZVFHMIN-NEXT:    sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT:    addi a0, a0, -1
-; RV64ZVFHMIN-NEXT:    and a0, a0, a2
+; RV64ZVFHMIN-NEXT:    sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT:    addi a0, a0, -16
+; RV64ZVFHMIN-NEXT:    neg a2, a2
+; RV64ZVFHMIN-NEXT:    and a0, a2, a0
 ; RV64ZVFHMIN-NEXT:    fsrmi a2, 1
 ; RV64ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index af3e9db9fe123..79f1b88a765b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1076,10 +1076,10 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFH-NEXT:  .LBB43_2:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v6, v8, v24, v0.t
-; ZVFH-NEXT:    addi a0, a2, -64
-; ZVFH-NEXT:    sltu a1, a2, a0
-; ZVFH-NEXT:    addi a1, a1, -1
-; ZVFH-NEXT:    and a0, a1, a0
+; ZVFH-NEXT:    sltiu a0, a2, 65
+; ZVFH-NEXT:    neg a0, a0
+; ZVFH-NEXT:    addi a1, a2, -64
+; ZVFH-NEXT:    and a0, a0, a1
 ; ZVFH-NEXT:    vmv1r.v v0, v7
 ; ZVFH-NEXT:    addi a1, sp, 16
 ; ZVFH-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -3929,10 +3929,10 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x
 ; CHECK-NEXT:  .LBB87_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v6, v8, v24, v0.t
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
index efc0f7ef4a441..9f354d160d7c1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
@@ -598,13 +598,13 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1>
 ; CHECK-NEXT:    addi a4, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a2)
-; CHECK-NEXT:    addi a2, a3, -128
+; CHECK-NEXT:    sltiu a2, a3, 129
 ; CHECK-NEXT:    vle8.v v24, (a4)
-; CHECK-NEXT:    sltu a4, a3, a2
+; CHECK-NEXT:    addi a4, a3, -128
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a2, a4, a2
-; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    neg a0, a2
+; CHECK-NEXT:    and a0, a0, a4
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vv v6, v16, v24, v0.t
 ; CHECK-NEXT:    bltu a3, a1, .LBB51_2
 ; CHECK-NEXT:  # %bb.1:
@@ -636,10 +636,10 @@ define <256 x i1> @icmp_eq_vx_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 z
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltiu a1, a2, 129
+; CHECK-NEXT:    addi a4, a2, -128
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB52_2
@@ -666,10 +666,10 @@ define <256 x i1> @icmp_eq_vx_swap_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m,
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltiu a1, a2, 129
+; CHECK-NEXT:    addi a4, a2, -128
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB53_2
@@ -1250,10 +1250,10 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m
 ; CHECK-NEXT:  .LBB99_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vv v6, v8, v24, v0.t
-; CHECK-NEXT:    addi a0, a2, -32
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 33
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -32
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -1286,10 +1286,10 @@ define <64 x i1> @icmp_eq_vx_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB100_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v25, v8, a0, v0.t
-; CHECK-NEXT:    addi a2, a1, -32
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    sltiu a2, a1, 33
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    addi a1, a1, -32
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v8, v16, a0, v0.t
@@ -1316,10 +1316,10 @@ define <64 x i1> @icmp_eq_vx_swap_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i
 ; CHECK-NEXT:  .LBB101_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v25, v8, a0, v0.t
-; CHECK-NEXT:    addi a2, a1, -32
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    sltiu a2, a1, 33
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    addi a1, a1, -32
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vx v8, v16, a0, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
index a452e5a9ffbb8..9a08596ebb473 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
@@ -147,10 +147,10 @@ define <32 x i64> @vsext_v32i64_v32i32(<32 x i32> %va, <32 x i1> %m, i32 zeroext
 ; CHECK-NEXT:  .LBB12_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vsext.vf2 v16, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 16
 ; CHECK-NEXT:    vmv1r.v v0, v24
@@ -174,10 +174,10 @@ define <32 x i64> @vsext_v32i64_v32i32_unmasked(<32 x i32> %va, i32 zeroext %evl
 ; CHECK-NEXT:  .LBB13_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vsext.vf2 v24, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 16
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
index afa8f2fda2ed4..8202ba4e2d815 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
@@ -372,10 +372,10 @@ define <32 x double> @vsitofp_v32f64_v32i64(<32 x i64> %va, <32 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB25_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -395,10 +395,10 @@ define <32 x double> @vsitofp_v32f64_v32i64_unmasked(<32 x i64> %va, i32 zeroext
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 8af4ced77be39..45c106240efc9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -487,25 +487,24 @@ define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    li a4, 16
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    bltu a2, a4, .LBB45_2
+; CHECK-NEXT:    sltiu a3, a2, 17
+; CHECK-NEXT:    addi a4, a2, -16
+; CHECK-NEXT:    neg a3, a3
+; CHECK-NEXT:    li a5, 16
+; CHECK-NEXT:    and a3, a3, a4
+; CHECK-NEXT:    bltu a2, a5, .LBB45_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 16
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:  .LBB45_2:
-; CHECK-NEXT:    mul a4, a3, a1
-; CHECK-NEXT:    addi a5, a2, -16
+; CHECK-NEXT:    mul a4, a2, a1
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 2
 ; CHECK-NEXT:    add a4, a0, a4
-; CHECK-NEXT:    sltu a2, a2, a5
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a5
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vlse64.v v16, (a4), a1, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
 ; CHECK-NEXT:    ret
   %load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr %ptr, i32 %stride, <32 x i1> %m, i32 %evl)
@@ -515,21 +514,20 @@ define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x
 define <32 x double> @strided_vpload_v32f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) nounwind {
 ; CHECK-LABEL: strided_vpload_v32f64_allones_mask:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a4, 16
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    bltu a2, a4, .LBB46_2
+; CHECK-NEXT:    sltiu a3, a2, 17
+; CHECK-NEXT:    addi a4, a2, -16
+; CHECK-NEXT:    neg a3, a3
+; CHECK-NEXT:    li a5, 16
+; CHECK-NEXT:    and a3, a3, a4
+; CHECK-NEXT:    bltu a2, a5, .LBB46_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 16
+; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:  .LBB46_2:
-; CHECK-NEXT:    mul a4, a3, a1
-; CHECK-NEXT:    addi a5, a2, -16
+; CHECK-NEXT:    mul a4, a2, a1
 ; CHECK-NEXT:    add a4, a0, a4
-; CHECK-NEXT:    sltu a2, a2, a5
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a5
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vlse64.v v16, (a4), a1
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT:    vlse64.v v16, (a4), a1
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vlse64.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr %ptr, i32 %stride, <32 x i1> splat (i1 true), i32 %evl)
@@ -549,10 +547,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV32-NEXT:    li a3, 32
 ; CHECK-RV32-NEXT:  .LBB47_2:
 ; CHECK-RV32-NEXT:    mul a6, a3, a2
-; CHECK-RV32-NEXT:    addi a5, a4, -32
-; CHECK-RV32-NEXT:    sltu a7, a4, a5
-; CHECK-RV32-NEXT:    addi a7, a7, -1
-; CHECK-RV32-NEXT:    and a7, a7, a5
+; CHECK-RV32-NEXT:    sltiu a5, a4, 33
+; CHECK-RV32-NEXT:    addi a7, a4, -32
+; CHECK-RV32-NEXT:    neg a5, a5
+; CHECK-RV32-NEXT:    and a7, a5, a7
 ; CHECK-RV32-NEXT:    li a5, 16
 ; CHECK-RV32-NEXT:    add a6, a1, a6
 ; CHECK-RV32-NEXT:    bltu a7, a5, .LBB47_4
@@ -563,10 +561,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV32-NEXT:    vslidedown.vi v0, v8, 4
 ; CHECK-RV32-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v16, (a6), a2, v0.t
-; CHECK-RV32-NEXT:    addi a6, a3, -16
-; CHECK-RV32-NEXT:    sltu a3, a3, a6
-; CHECK-RV32-NEXT:    addi a3, a3, -1
-; CHECK-RV32-NEXT:    and a3, a3, a6
+; CHECK-RV32-NEXT:    sltiu a6, a3, 17
+; CHECK-RV32-NEXT:    neg a6, a6
+; CHECK-RV32-NEXT:    addi a3, a3, -16
+; CHECK-RV32-NEXT:    and a3, a6, a3
 ; CHECK-RV32-NEXT:    bltu a4, a5, .LBB47_6
 ; CHECK-RV32-NEXT:  # %bb.5:
 ; CHECK-RV32-NEXT:    li a4, 16
@@ -600,10 +598,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV64-NEXT:    li a4, 32
 ; CHECK-RV64-NEXT:  .LBB47_2:
 ; CHECK-RV64-NEXT:    mul a6, a4, a2
-; CHECK-RV64-NEXT:    addi a5, a3, -32
-; CHECK-RV64-NEXT:    sltu a7, a3, a5
-; CHECK-RV64-NEXT:    addi a7, a7, -1
-; CHECK-RV64-NEXT:    and a7, a7, a5
+; CHECK-RV64-NEXT:    sltiu a5, a3, 33
+; CHECK-RV64-NEXT:    addi a7, a3, -32
+; CHECK-RV64-NEXT:    neg a5, a5
+; CHECK-RV64-NEXT:    and a7, a5, a7
 ; CHECK-RV64-NEXT:    li a5, 16
 ; CHECK-RV64-NEXT:    add a6, a1, a6
 ; CHECK-RV64-NEXT:    bltu a7, a5, .LBB47_4
@@ -614,10 +612,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV64-NEXT:    vslidedown.vi v0, v8, 4
 ; CHECK-RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vlse64.v v16, (a6), a2, v0.t
-; CHECK-RV64-NEXT:    addi a6, a4, -16
-; CHECK-RV64-NEXT:    sltu a4, a4, a6
-; CHECK-RV64-NEXT:    addi a4, a4, -1
-; CHECK-RV64-NEXT:    and a4, a4, a6
+; CHECK-RV64-NEXT:    sltiu a6, a4, 17
+; CHECK-RV64-NEXT:    neg a6, a6
+; CHECK-RV64-NEXT:    addi a4, a4, -16
+; CHECK-RV64-NEXT:    and a4, a6, a4
 ; CHECK-RV64-NEXT:    bltu a3, a5, .LBB47_6
 ; CHECK-RV64-NEXT:  # %bb.5:
 ; CHECK-RV64-NEXT:    li a3, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
index 25624ea0fcf6c..c7edae931a126 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
@@ -411,14 +411,14 @@ define void @strided_store_v32f64(<32 x double> %v, ptr %ptr, i32 signext %strid
 ; CHECK-NEXT:  .LBB38_2:
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v8, (a0), a1, v0.t
+; CHECK-NEXT:    sltiu a4, a2, 17
+; CHECK-NEXT:    addi a2, a2, -16
 ; CHECK-NEXT:    mul a3, a3, a1
-; CHECK-NEXT:    add a0, a0, a3
-; CHECK-NEXT:    addi a3, a2, -16
-; CHECK-NEXT:    sltu a2, a2, a3
-; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    neg a4, a4
+; CHECK-NEXT:    and a2, a4, a2
+; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v16, (a0), a1, v0.t
 ; CHECK-NEXT:    ret
@@ -437,12 +437,12 @@ define void @strided_store_v32f64_allones_mask(<32 x double> %v, ptr %ptr, i32 s
 ; CHECK-NEXT:  .LBB39_2:
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-NEXT:    sltiu a4, a2, 17
+; CHECK-NEXT:    addi a2, a2, -16
 ; CHECK-NEXT:    mul a3, a3, a1
+; CHECK-NEXT:    neg a4, a4
+; CHECK-NEXT:    and a2, a4, a2
 ; CHECK-NEXT:    add a0, a0, a3
-; CHECK-NEXT:    addi a3, a2, -16
-; CHECK-NEXT:    sltu a2, a2, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v16, (a0), a1
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
index f992d1f8f7eee..f69a4ffde7910 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
@@ -56,10 +56,10 @@ define <128 x i7> @vtrunc_v128i7_v128i16(<128 x i16> %a, <128 x i1> %m, i32 zero
 ; CHECK-NEXT:  .LBB4_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v24, 0, v0.t
-; CHECK-NEXT:    addi a2, a0, -64
-; CHECK-NEXT:    sltu a0, a0, a2
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    sltiu a2, a0, 65
+; CHECK-NEXT:    addi a0, a0, -64
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    and a0, a2, a0
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v24, v16, 0, v0.t
@@ -214,79 +214,85 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; RV32-NEXT:    vmv1r.v v7, v0
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vslidedown.vi v5, v0, 8
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v4, v0, 4
-; RV32-NEXT:    addi a2, a7, -64
-; RV32-NEXT:    vslidedown.vi v3, v5, 4
-; RV32-NEXT:    sltu a3, a7, a2
-; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    and a4, a3, a2
-; RV32-NEXT:    addi a2, a4, -32
-; RV32-NEXT:    sltu a3, a4, a2
-; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    and a3, a3, a2
+; RV32-NEXT:    sltiu a2, a7, 65
+; RV32-NEXT:    addi a3, a7, -64
+; RV32-NEXT:    neg a4, a2
+; RV32-NEXT:    and a4, a4, a3
+; RV32-NEXT:    sltiu a2, a4, 33
+; RV32-NEXT:    addi a3, a4, -32
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and t1, a2, a3
 ; RV32-NEXT:    li a2, 16
-; RV32-NEXT:    addi t0, a3, -16
-; RV32-NEXT:    mv a5, a3
-; RV32-NEXT:    bltu a3, a2, .LBB16_2
+; RV32-NEXT:    vslidedown.vi v3, v5, 4
+; RV32-NEXT:    mv a5, t1
+; RV32-NEXT:    bltu t1, a2, .LBB16_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a5, 16
 ; RV32-NEXT:  .LBB16_2:
-; RV32-NEXT:    li t2, 64
-; RV32-NEXT:    sltu t1, a3, t0
+; RV32-NEXT:    li t0, 64
+; RV32-NEXT:    sltiu a3, t1, 17
 ; RV32-NEXT:    mv a6, a7
-; RV32-NEXT:    bltu a7, t2, .LBB16_4
+; RV32-NEXT:    bltu a7, t0, .LBB16_4
 ; RV32-NEXT:  # %bb.3:
 ; RV32-NEXT:    li a6, 64
 ; RV32-NEXT:  .LBB16_4:
 ; RV32-NEXT:    addi t2, a1, 128
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v6, v4, 2
-; RV32-NEXT:    addi t6, a1, 512
-; RV32-NEXT:    addi t5, a1, 640
+; RV32-NEXT:    addi t5, a1, 512
+; RV32-NEXT:    addi t4, a1, 640
 ; RV32-NEXT:    vslidedown.vi v0, v3, 2
-; RV32-NEXT:    addi t1, t1, -1
+; RV32-NEXT:    neg t0, a3
+; RV32-NEXT:    addi t1, t1, -16
 ; RV32-NEXT:    addi t3, a1, 384
 ; RV32-NEXT:    vslidedown.vi v2, v5, 2
 ; RV32-NEXT:    li a3, 32
-; RV32-NEXT:    addi t4, a6, -32
-; RV32-NEXT:    sltu a6, a6, t4
-; RV32-NEXT:    addi a6, a6, -1
-; RV32-NEXT:    and a6, a6, t4
-; RV32-NEXT:    addi t4, a6, -16
-; RV32-NEXT:    sltu s0, a6, t4
-; RV32-NEXT:    addi s0, s0, -1
+; RV32-NEXT:    sltiu t6, a6, 33
+; RV32-NEXT:    addi a6, a6, -32
+; RV32-NEXT:    neg t6, t6
+; RV32-NEXT:    and a6, t6, a6
+; RV32-NEXT:    sltiu t6, a6, 17
+; RV32-NEXT:    neg t6, t6
+; RV32-NEXT:    addi s0, a6, -16
 ; RV32-NEXT:    bltu a6, a2, .LBB16_6
 ; RV32-NEXT:  # %bb.5:
 ; RV32-NEXT:    li a6, 16
 ; RV32-NEXT:  .LBB16_6:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle64.v v8, (t6)
-; RV32-NEXT:    csrr t6, vlenb
+; RV32-NEXT:    vle64.v v8, (t5)
+; RV32-NEXT:    csrr t5, vlenb
 ; RV32-NEXT:    sw a0, 4(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    mul t6, t6, a0
+; RV32-NEXT:    mul t5, t5, a0
 ; RV32-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT:    add t6, sp, t6
-; RV32-NEXT:    addi t6, t6, 16
-; RV32-NEXT:    vs8r.v v8, (t6) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    vle64.v v8, (t5)
-; RV32-NEXT:    vle64.v v16, (t2)
+; RV32-NEXT:    add t5, sp, t5
+; RV32-NEXT:    addi t5, t5, 16
+; RV32-NEXT:    vs8r.v v8, (t5) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vle64.v v16, (t4)
+; RV32-NEXT:    vle64.v v8, (t2)
+; RV32-NEXT:    csrr t2, vlenb
+; RV32-NEXT:    li t4, 40
+; RV32-NEXT:    mul t2, t2, t4
+; RV32-NEXT:    add t2, sp, t2
+; RV32-NEXT:    addi t2, t2, 16
+; RV32-NEXT:    vs8r.v v8, (t2) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vle64.v v24, (a1)
 ; RV32-NEXT:    csrr t2, vlenb
-; RV32-NEXT:    li t5, 48
-; RV32-NEXT:    mul t2, t2, t5
+; RV32-NEXT:    li t4, 48
+; RV32-NEXT:    mul t2, t2, t4
 ; RV32-NEXT:    add t2, sp, t2
 ; RV32-NEXT:    addi t2, t2, 16
 ; RV32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
@@ -296,8 +302,8 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    add t2, sp, t2
 ; RV32-NEXT:    addi t2, t2, 16
 ; RV32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    and t2, t1, t0
-; RV32-NEXT:    and t1, s0, t4
+; RV32-NEXT:    and t2, t0, t1
+; RV32-NEXT:    and t1, t6, s0
 ; RV32-NEXT:    addi a1, a1, 256
 ; RV32-NEXT:    mv t0, a4
 ; RV32-NEXT:    bltu a4, a3, .LBB16_8
@@ -305,45 +311,45 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    li t0, 32
 ; RV32-NEXT:  .LBB16_8:
 ; RV32-NEXT:    vsetvli zero, t2, e32, m4, ta, ma
-; RV32-NEXT:    vnsrl.wi v24, v8, 0, v0.t
-; RV32-NEXT:    csrr t2, vlenb
-; RV32-NEXT:    li t3, 24
-; RV32-NEXT:    mul t2, t2, t3
-; RV32-NEXT:    add t2, sp, t2
-; RV32-NEXT:    addi t2, t2, 16
-; RV32-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vnsrl.wi v24, v16, 0, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr t2, vlenb
 ; RV32-NEXT:    li t3, 56
 ; RV32-NEXT:    mul t2, t2, t3
 ; RV32-NEXT:    add t2, sp, t2
 ; RV32-NEXT:    addi t2, t2, 16
-; RV32-NEXT:    vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v8, (t2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a5, e32, m4, ta, ma
-; RV32-NEXT:    vnsrl.wi v8, v24, 0, v0.t
+; RV32-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 6
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vmv1r.v v0, v6
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    li t2, 40
+; RV32-NEXT:    mul a5, a5, t2
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 16
+; RV32-NEXT:    vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, t1, e32, m4, ta, ma
-; RV32-NEXT:    vnsrl.wi v8, v16, 0, v0.t
+; RV32-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 4
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a5, t0, -16
-; RV32-NEXT:    sltu t0, t0, a5
-; RV32-NEXT:    addi t0, t0, -1
-; RV32-NEXT:    and a5, t0, a5
+; RV32-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    sltiu a5, t0, 17
+; RV32-NEXT:    addi t0, t0, -16
+; RV32-NEXT:    neg a5, a5
+; RV32-NEXT:    and a5, a5, t0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a1)
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v30, v7, 2
+; RV32-NEXT:    vslidedown.vi v28, v7, 2
 ; RV32-NEXT:    vmv1r.v v0, v4
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li t0, 48
@@ -364,9 +370,15 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a5, e32, m4, ta, ma
-; RV32-NEXT:    vnsrl.wi v16, v8, 0, v0.t
+; RV32-NEXT:    vnsrl.wi v8, v16, 0, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    mul a1, a1, a5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    bltu a4, a2, .LBB16_10
 ; RV32-NEXT:  # %bb.9:
 ; RV32-NEXT:    li a4, 16
@@ -375,32 +387,33 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a4, e32, m4, ta, ma
-; RV32-NEXT:    vnsrl.wi v24, v8, 0, v0.t
+; RV32-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; RV32-NEXT:    mv a1, a7
 ; RV32-NEXT:    bltu a7, a3, .LBB16_12
 ; RV32-NEXT:  # %bb.11:
 ; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:  .LBB16_12:
-; RV32-NEXT:    vmv1r.v v0, v30
+; RV32-NEXT:    vmv1r.v v0, v28
+; RV32-NEXT:    vmv4r.v v8, v24
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
+; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    li a5, 40
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
@@ -417,7 +430,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    addi a4, a1, -16
+; RV32-NEXT:    sltiu a4, a1, 17
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    li a6, 56
 ; RV32-NEXT:    mul a5, a5, a6
@@ -438,7 +451,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li a6, 24
+; RV32-NEXT:    li a6, 40
 ; RV32-NEXT:    mul a5, a5, a6
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
@@ -450,11 +463,12 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV32-NEXT:    sltu a1, a1, a4
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a1, a1, a4
+; RV32-NEXT:    neg a4, a4
+; RV32-NEXT:    addi a1, a1, -16
+; RV32-NEXT:    and a1, a4, a1
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 5
+; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
@@ -466,35 +480,34 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:  .LBB16_14:
 ; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vsetvli zero, a7, e32, m4, ta, ma
-; RV32-NEXT:    vnsrl.wi v24, v16, 0, v0.t
+; RV32-NEXT:    vnsrl.wi v16, v24, 0, v0.t
 ; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vslideup.vi v24, v8, 16
-; RV32-NEXT:    vse32.v v24, (a0)
-; RV32-NEXT:    addi a1, a0, 256
+; RV32-NEXT:    vslideup.vi v16, v8, 16
+; RV32-NEXT:    vse32.v v16, (a0)
+; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 48
+; RV32-NEXT:    li a3, 56
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
-; RV32-NEXT:    addi a1, a0, 128
+; RV32-NEXT:    addi a1, a0, 384
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 56
-; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    slli a2, a2, 6
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
-; RV32-NEXT:    addi a0, a0, 384
+; RV32-NEXT:    addi a0, a0, 256
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    li a2, 48
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
@@ -537,66 +550,66 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV64-NEXT:    vslidedown.vi v5, v0, 8
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v4, v0, 4
-; RV64-NEXT:    addi a2, a7, -64
-; RV64-NEXT:    vslidedown.vi v3, v5, 4
-; RV64-NEXT:    sltu a3, a7, a2
-; RV64-NEXT:    addi a3, a3, -1
-; RV64-NEXT:    and a4, a3, a2
-; RV64-NEXT:    addi a2, a4, -32
-; RV64-NEXT:    sltu a3, a4, a2
-; RV64-NEXT:    addi a3, a3, -1
-; RV64-NEXT:    and a3, a3, a2
+; RV64-NEXT:    sltiu a2, a7, 65
+; RV64-NEXT:    addi a3, a7, -64
+; RV64-NEXT:    neg a4, a2
+; RV64-NEXT:    and a4, a4, a3
+; RV64-NEXT:    sltiu a2, a4, 33
+; RV64-NEXT:    addi a3, a4, -32
+; RV64-NEXT:    neg a2, a2
+; RV64-NEXT:    and t1, a2, a3
 ; RV64-NEXT:    li a2, 16
-; RV64-NEXT:    addi t0, a3, -16
-; RV64-NEXT:    mv a5, a3
-; RV64-NEXT:    bltu a3, a2, .LBB16_2
+; RV64-NEXT:    vslidedown.vi v3, v5, 4
+; RV64-NEXT:    mv a5, t1
+; RV64-NEXT:    bltu t1, a2, .LBB16_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a5, 16
 ; RV64-NEXT:  .LBB16_2:
-; RV64-NEXT:    li t2, 64
-; RV64-NEXT:    sltu t1, a3, t0
+; RV64-NEXT:    li t0, 64
+; RV64-NEXT:    sltiu a3, t1, 17
 ; RV64-NEXT:    mv a6, a7
-; RV64-NEXT:    bltu a7, t2, .LBB16_4
+; RV64-NEXT:    bltu a7, t0, .LBB16_4
 ; RV64-NEXT:  # %bb.3:
 ; RV64-NEXT:    li a6, 64
 ; RV64-NEXT:  .LBB16_4:
 ; RV64-NEXT:    addi t2, a1, 128
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v6, v4, 2
-; RV64-NEXT:    addi t6, a1, 512
-; RV64-NEXT:    addi t5, a1, 640
+; RV64-NEXT:    addi t5, a1, 512
+; RV64-NEXT:    addi t4, a1, 640
 ; RV64-NEXT:    vslidedown.vi v0, v3, 2
-; RV64-NEXT:    addi t1, t1, -1
+; RV64-NEXT:    neg t0, a3
+; RV64-NEXT:    addi t1, t1, -16
 ; RV64-NEXT:    addi t3, a1, 384
 ; RV64-NEXT:    vslidedown.vi v2, v5, 2
 ; RV64-NEXT:    li a3, 32
-; RV64-NEXT:    addi t4, a6, -32
-; RV64-NEXT:    sltu a6, a6, t4
-; RV64-NEXT:    addi a6, a6, -1
-; RV64-NEXT:    and a6, a6, t4
-; RV64-NEXT:    addi t4, a6, -16
-; RV64-NEXT:    sltu s0, a6, t4
-; RV64-NEXT:    addi s0, s0, -1
+; RV64-NEXT:    sltiu t6, a6, 33
+; RV64-NEXT:    addi a6, a6, -32
+; RV64-NEXT:    neg t6, t6
+; RV64-NEXT:    and a6, t6, a6
+; RV64-NEXT:    sltiu t6, a6, 17
+; RV64-NEXT:    neg t6, t6
+; RV64-NEXT:    addi s0, a6, -16
 ; RV64-NEXT:    bltu a6, a2, .LBB16_6
 ; RV64-NEXT:  # %bb.5:
 ; RV64-NEXT:    li a6, 16
 ; RV64-NEXT:  .LBB16_6:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v8, (t6)
-; RV64-NEXT:    csrr t6, vlenb
+; RV64-NEXT:    vle64.v v8, (t5)
+; RV64-NEXT:    csrr t5, vlenb
 ; RV64-NEXT:    sd a0, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    mul t6, t6, a0
+; RV64-NEXT:    mul t5, t5, a0
 ; RV64-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    add t6, sp, t6
-; RV64-NEXT:    addi t6, t6, 32
-; RV64-NEXT:    vs8r.v v8, (t6) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    vle64.v v8, (t5)
+; RV64-NEXT:    add t5, sp, t5
+; RV64-NEXT:    addi t5, t5, 32
+; RV64-NEXT:    vs8r.v v8, (t5) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vle64.v v8, (t4)
 ; RV64-NEXT:    vle64.v v16, (t2)
 ; RV64-NEXT:    vle64.v v24, (a1)
 ; RV64-NEXT:    csrr t2, vlenb
-; RV64-NEXT:    li t5, 48
-; RV64-NEXT:    mul t2, t2, t5
+; RV64-NEXT:    li t4, 48
+; RV64-NEXT:    mul t2, t2, t4
 ; RV64-NEXT:    add t2, sp, t2
 ; RV64-NEXT:    addi t2, t2, 32
 ; RV64-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
@@ -606,8 +619,8 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV64-NEXT:    add t2, sp, t2
 ; RV64-NEXT:    addi t2, t2, 32
 ; RV64-NEXT:    vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    and t2, t1, t0
-; RV64-NEXT:    and t1, s0, t4
+; RV64-NEXT:    and t2, t0, t1
+; RV64-NEXT:    and t1, t6, s0
 ; RV64-NEXT:    addi a1, a1, 256
 ; RV64-NEXT:    mv t0, a4
 ; RV64-NEXT:    bltu a4, a3, .LBB16_8
@@ -644,10 +657,10 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV64-NEXT:    add a5, sp, a5
 ; RV64-NEXT:    addi a5, a5, 32
 ; RV64-NEXT:    vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    addi a5, t0, -16
-; RV64-NEXT:    sltu t0, t0, a5
-; RV64-NEXT:    addi t0, t0, -1
-; RV64-NEXT:    and a5, t0, a5
+; RV64-NEXT:    sltiu a5, t0, 17
+; RV64-NEXT:    addi t0, t0, -16
+; RV64-NEXT:    neg a5, a5
+; RV64-NEXT:    and a5, a5, t0
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a1)
 ; RV64-NEXT:    addi a1, sp, 32
@@ -727,7 +740,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV64-NEXT:    add a4, sp, a4
 ; RV64-NEXT:    addi a4, a4, 32
 ; RV64-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    addi a4, a1, -16
+; RV64-NEXT:    sltiu a4, a1, 17
 ; RV64-NEXT:    csrr a5, vlenb
 ; RV64-NEXT:    li a6, 56
 ; RV64-NEXT:    mul a5, a5, a6
@@ -760,9 +773,9 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV64-NEXT:    add a5, sp, a5
 ; RV64-NEXT:    addi a5, a5, 32
 ; RV64-NEXT:    vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT:    sltu a1, a1, a4
-; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    and a1, a1, a4
+; RV64-NEXT:    neg a4, a4
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    and a1, a4, a1
 ; RV64-NEXT:    csrr a4, vlenb
 ; RV64-NEXT:    slli a4, a4, 5
 ; RV64-NEXT:    add a4, sp, a4
@@ -786,17 +799,17 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV64-NEXT:    vslideup.vi v24, v8, 16
 ; RV64-NEXT:    vse32.v v24, (a0)
-; RV64-NEXT:    addi a1, a0, 256
+; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 48
+; RV64-NEXT:    li a3, 56
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 32
 ; RV64-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    vse32.v v8, (a1)
-; RV64-NEXT:    addi a1, a0, 128
+; RV64-NEXT:    addi a1, a0, 256
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    li a3, 48
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 32
@@ -837,10 +850,10 @@ define <32 x i32> @vtrunc_v32i32_v32i64(<32 x i64> %a, <32 x i1> %m, i32 zeroext
 ; CHECK-NEXT:  .LBB17_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v24, 0, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v24, v16, 0, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
index 3d1febe95421f..cde3f21947824 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
@@ -372,10 +372,10 @@ define <32 x double> @vuitofp_v32f64_v32i64(<32 x i64> %va, <32 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB25_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
@@ -395,10 +395,10 @@ define <32 x double> @vuitofp_v32f64_v32i64_unmasked(<32 x i64> %va, i32 zeroext
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
index 96dff2464e501..3fc3b47113a32 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
@@ -354,10 +354,10 @@ define <256 x i8> @vadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    addi a0, a1, -128
-; CHECK-NEXT:    sltu a3, a1, a0
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a0, a3, a0
+; CHECK-NEXT:    sltiu a0, a1, 129
+; CHECK-NEXT:    addi a3, a1, -128
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v16, v16, -1, v0.t
 ; CHECK-NEXT:    bltu a1, a2, .LBB32_2
@@ -383,10 +383,10 @@ define <256 x i8> @vadd_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:  .LBB33_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v8, v8, -1
-; CHECK-NEXT:    addi a1, a0, -128
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 129
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -128
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v16, v16, -1
 ; CHECK-NEXT:    ret
@@ -1328,10 +1328,10 @@ define <32 x i64> @vadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:  .LBB108_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v16, v16, -1, v0.t
@@ -1351,10 +1351,10 @@ define <32 x i64> @vadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:  .LBB109_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v8, v8, -1
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vadd.vi v16, v16, -1
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
index da26c63b61e34..f2e051ee41ccb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
@@ -453,10 +453,10 @@ define <32 x double> @vfsgnj_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsgnj.vv v8, v8, v24, v0.t
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -488,10 +488,10 @@ define <32 x double> @vfsgnj_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
 ; CHECK-NEXT:  .LBB35_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsgnj.vv v8, v8, v0
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsgnj.vv v16, v16, v24
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
index 2774aba974a29..12c7009e43a44 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
@@ -621,10 +621,10 @@ define <32 x double> @vfabs_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v16, v0.t
@@ -644,10 +644,10 @@ define <32 x double> @vfabs_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; CHECK-NEXT:  .LBB35_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
index f28b970f48ff7..e863e141376e9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
@@ -855,10 +855,10 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a0, a4, -16
-; CHECK-NEXT:    sltu a1, a4, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a4, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a4, a4, -16
+; CHECK-NEXT:    and a0, a0, a4
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a2, 24
@@ -898,27 +898,21 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    addi a1, a2, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a2)
 ; CHECK-NEXT:    addi a2, a0, 128
-; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    vle64.v v24, (a2)
 ; CHECK-NEXT:    vle64.v v0, (a0)
 ; CHECK-NEXT:    li a1, 16
@@ -927,31 +921,25 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB51_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v0, v8, v16
-; CHECK-NEXT:    addi a0, a4, -16
-; CHECK-NEXT:    sltu a1, a4, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a4, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a4, a4, -16
+; CHECK-NEXT:    and a0, a0, a4
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v16, v8
+; CHECK-NEXT:    vfmadd.vv v24, v8, v16
 ; CHECK-NEXT:    vmv8r.v v8, v0
 ; CHECK-NEXT:    vmv.v.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
index 403d0b8d57940..484389e29bed9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
@@ -381,10 +381,10 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmax.vv v8, v8, v24, v0.t
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -416,10 +416,10 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
 ; CHECK-NEXT:  .LBB27_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmax.vv v8, v8, v0
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmax.vv v16, v16, v24
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
index 56f7a8d48c5a1..92564e229bccc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
@@ -381,10 +381,10 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmin.vv v8, v8, v24, v0.t
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -416,10 +416,10 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
 ; CHECK-NEXT:  .LBB27_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmin.vv v8, v8, v0
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmin.vv v16, v16, v24
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
index a9857880b5942..5298b186f2d25 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
@@ -627,10 +627,10 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a0, a4, -16
-; CHECK-NEXT:    sltu a1, a4, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a4, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a4, a4, -16
+; CHECK-NEXT:    and a0, a0, a4
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a2, 24
@@ -670,27 +670,21 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    addi a1, a2, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a2)
 ; CHECK-NEXT:    addi a2, a0, 128
-; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    vle64.v v24, (a2)
 ; CHECK-NEXT:    vle64.v v0, (a0)
 ; CHECK-NEXT:    li a1, 16
@@ -699,31 +693,25 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB51_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v0, v8, v16
-; CHECK-NEXT:    addi a0, a4, -16
-; CHECK-NEXT:    sltu a1, a4, a0
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    sltiu a0, a4, 17
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    addi a4, a4, -16
+; CHECK-NEXT:    and a0, a0, a4
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v16, v8
+; CHECK-NEXT:    vfmadd.vv v24, v8, v16
 ; CHECK-NEXT:    vmv8r.v v8, v0
 ; CHECK-NEXT:    vmv.v.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
index 84a89b23bc3b5..2b09bd9a22b1f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
@@ -589,10 +589,10 @@ define <32 x double> @vfneg_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfneg.v v8, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfneg.v v16, v16, v0.t
@@ -612,10 +612,10 @@ define <32 x double> @vfneg_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; CHECK-NEXT:  .LBB35_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfneg.v v8, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfneg.v v16, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
index b431d4873fa1b..9f72f786591a5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
@@ -361,10 +361,10 @@ define <32 x double> @vfsqrt_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zero
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsqrt.v v8, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsqrt.v v16, v16, v0.t
@@ -384,10 +384,10 @@ define <32 x double> @vfsqrt_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %
 ; CHECK-NEXT:  .LBB27_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsqrt.v v8, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsqrt.v v16, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
index f5978de080082..aa7c3d5e113d6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
@@ -258,10 +258,10 @@ define <256 x i8> @vmax_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zero
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltiu a1, a2, 129
+; CHECK-NEXT:    addi a4, a2, -128
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmax.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB22_2
@@ -289,10 +289,10 @@ define <256 x i8> @vmax_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %e
 ; CHECK-NEXT:  .LBB23_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vmax.vx v8, v8, a0
-; CHECK-NEXT:    addi a2, a1, -128
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    sltiu a2, a1, 129
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    addi a1, a1, -128
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmax.vx v16, v16, a0
 ; CHECK-NEXT:    ret
@@ -1001,10 +1001,10 @@ define <32 x i64> @vmax_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vmax.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmax.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
index 7450a70df66ba..3d6dc76d5e70d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
@@ -257,10 +257,10 @@ define <256 x i8> @vmaxu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zer
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltiu a1, a2, 129
+; CHECK-NEXT:    addi a4, a2, -128
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmaxu.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB22_2
@@ -288,10 +288,10 @@ define <256 x i8> @vmaxu_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %
 ; CHECK-NEXT:  .LBB23_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vmaxu.vx v8, v8, a0
-; CHECK-NEXT:    addi a2, a1, -128
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    sltiu a2, a1, 129
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    addi a1, a1, -128
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmaxu.vx v16, v16, a0
 ; CHECK-NEXT:    ret
@@ -1000,10 +1000,10 @@ define <32 x i64> @vmaxu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vmaxu.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmaxu.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
index 31d19304c2909..5000bea58fa36 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
@@ -258,10 +258,10 @@ define <256 x i8> @vmin_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zero
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltiu a1, a2, 129
+; CHECK-NEXT:    addi a4, a2, -128
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmin.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB22_2
@@ -289,10 +289,10 @@ define <256 x i8> @vmin_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %e
 ; CHECK-NEXT:  .LBB23_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vmin.vx v8, v8, a0
-; CHECK-NEXT:    addi a2, a1, -128
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    sltiu a2, a1, 129
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    addi a1, a1, -128
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmin.vx v16, v16, a0
 ; CHECK-NEXT:    ret
@@ -1001,10 +1001,10 @@ define <32 x i64> @vmin_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vmin.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmin.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
index dda69ec8a7d2e..42b05a295e50e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
@@ -257,10 +257,10 @@ define <256 x i8> @vminu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zer
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a1)
-; CHECK-NEXT:    addi a1, a2, -128
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a1, a4, a1
+; CHECK-NEXT:    sltiu a1, a2, 129
+; CHECK-NEXT:    addi a4, a2, -128
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vminu.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB22_2
@@ -288,10 +288,10 @@ define <256 x i8> @vminu_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %
 ; CHECK-NEXT:  .LBB23_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
-; CHECK-NEXT:    addi a2, a1, -128
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    sltiu a2, a1, 129
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    addi a1, a1, -128
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vminu.vx v16, v16, a0
 ; CHECK-NEXT:    ret
@@ -1000,10 +1000,10 @@ define <32 x i64> @vminu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vminu.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vminu.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
index 3f5751aaa2cad..071a726604787 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
@@ -285,16 +285,16 @@ define <32 x i8> @vpgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %
 ; RV64-NEXT:    vsext.vf8 v16, v8
 ; RV64-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; RV64-NEXT:    vluxei64.v v10, (a0), v16, v0.t
-; RV64-NEXT:    addi a2, a1, -16
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
 ; RV64-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 16
-; RV64-NEXT:    sltu a1, a1, a2
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v16, v8
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    li a0, 32
@@ -1997,12 +1997,12 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex
 ; RV32-NEXT:  .LBB94_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v24, (zero), v8, v0.t
-; RV32-NEXT:    addi a1, a0, -16
+; RV32-NEXT:    sltiu a1, a0, 17
+; RV32-NEXT:    addi a0, a0, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a0, a0, a1
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    neg a1, a1
+; RV32-NEXT:    and a0, a1, a0
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 16
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -2020,12 +2020,12 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex
 ; RV64-NEXT:  .LBB94_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (zero), v8, v0.t
-; RV64-NEXT:    addi a1, a0, -16
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    sltiu a1, a0, 17
+; RV64-NEXT:    addi a0, a0, -16
+; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    and a0, a1, a0
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (zero), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2048,12 +2048,12 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
 ; RV32-NEXT:  .LBB95_2:
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2077,12 +2077,12 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
 ; RV64-NEXT:  .LBB95_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2106,12 +2106,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV32-NEXT:  .LBB96_2:
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2136,12 +2136,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV64-NEXT:  .LBB96_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2168,12 +2168,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV32-NEXT:    vluxei16.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
-; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
+; RV32-NEXT:    neg a2, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei16.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2194,12 +2194,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV64-NEXT:    vluxei16.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v16, 16
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei16.v v16, (a0), v24, v0.t
 ; RV64-NEXT:    ret
@@ -2226,12 +2226,12 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
-; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
+; RV32-NEXT:    neg a2, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2253,12 +2253,12 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
 ; RV64-NEXT:  .LBB98_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2284,12 +2284,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
-; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
+; RV32-NEXT:    neg a2, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2312,12 +2312,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV64-NEXT:  .LBB99_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2344,12 +2344,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
-; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
+; RV32-NEXT:    neg a2, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2370,12 +2370,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV64-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v16, 16
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV64-NEXT:    ret
@@ -2399,12 +2399,12 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
 ; RV32-NEXT:  .LBB101_2:
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2427,12 +2427,12 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
 ; RV64-NEXT:  .LBB101_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v24, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2455,12 +2455,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV32-NEXT:  .LBB102_2:
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2483,12 +2483,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV64-NEXT:  .LBB102_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v24, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2512,12 +2512,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV32-NEXT:  .LBB103_2:
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2540,12 +2540,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV64-NEXT:  .LBB103_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v24, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
@@ -2575,12 +2575,12 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
-; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    sltu a1, a1, a2
-; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    sltiu a2, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
+; RV32-NEXT:    neg a2, a2
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v24, v0.t
 ; RV32-NEXT:    ret
@@ -2598,12 +2598,12 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
 ; RV64-NEXT:  .LBB104_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a2, a1, 17
+; RV64-NEXT:    addi a1, a1, -16
+; RV64-NEXT:    neg a2, a2
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a1, a2
+; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (a0), v16, v0.t
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
index d058669c103f3..8e50dffcaf31c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
@@ -325,12 +325,12 @@ define <32 x double> @vpload_v32f64(ptr %ptr, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:  .LBB31_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0), v0.t
-; CHECK-NEXT:    addi a2, a1, -16
+; CHECK-NEXT:    sltiu a2, a1, 17
+; CHECK-NEXT:    addi a1, a1, -16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a0), v0.t
@@ -352,15 +352,15 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:  .LBB32_2:
-; CHECK-NEXT:    addi a5, a3, -16
+; CHECK-NEXT:    sltiu a5, a3, 17
+; CHECK-NEXT:    addi a3, a3, -16
 ; CHECK-NEXT:    addi a4, a1, 128
-; CHECK-NEXT:    addi a7, a2, -32
-; CHECK-NEXT:    sltu a3, a3, a5
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a6, a3, a5
-; CHECK-NEXT:    sltu a3, a2, a7
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a5, a3, a7
+; CHECK-NEXT:    sltiu a7, a2, 33
+; CHECK-NEXT:    neg a5, a5
+; CHECK-NEXT:    and a6, a5, a3
+; CHECK-NEXT:    addi a3, a2, -32
+; CHECK-NEXT:    neg a5, a7
+; CHECK-NEXT:    and a5, a5, a3
 ; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v8, 2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
index 0bacb5c26cb4a..3a36cda6dd04a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
@@ -1306,12 +1306,12 @@ define <32 x double> @vpmerge_vv_v32f64(<32 x double> %va, <32 x double> %vb, <3
 ; CHECK-NEXT:  .LBB83_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, tu, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    neg a0, a0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, tu, ma
@@ -1339,12 +1339,12 @@ define <32 x double> @vpmerge_vf_v32f64(double %a, <32 x double> %vb, <32 x i1>
 ; CHECK-NEXT:  .LBB84_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, tu, ma
 ; CHECK-NEXT:    vfmerge.vfm v8, v8, fa0, v0
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, tu, ma
 ; CHECK-NEXT:    vfmerge.vfm v16, v16, fa0, v0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
index b4d20d93f2a1c..e509b390a3067 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
@@ -1703,12 +1703,12 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
 ; RV32-NEXT:  .LBB83_2:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (zero), v24, v0.t
-; RV32-NEXT:    addi a0, a1, -16
+; RV32-NEXT:    sltiu a0, a1, 17
+; RV32-NEXT:    addi a1, a1, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a1, a1, a0
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    neg a0, a0
+; RV32-NEXT:    and a0, a0, a1
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1737,12 +1737,12 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
 ; RV64-NEXT:  .LBB83_2:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (zero), v24, v0.t
-; RV64-NEXT:    addi a0, a2, -16
-; RV64-NEXT:    sltu a1, a2, a0
-; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    sltiu a0, a2, 17
+; RV64-NEXT:    addi a2, a2, -16
+; RV64-NEXT:    neg a0, a0
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    and a0, a0, a2
 ; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1773,12 +1773,12 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
 ; RV32-NEXT:  .LBB84_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
-; RV32-NEXT:    addi a1, a2, -16
+; RV32-NEXT:    sltiu a1, a2, 17
+; RV32-NEXT:    addi a2, a2, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a2, a2, a1
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    neg a1, a1
+; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1819,12 +1819,12 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
 ; RV64-NEXT:    vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v24, (a0), v16, v0.t
-; RV64-NEXT:    addi a1, a2, -16
-; RV64-NEXT:    sltu a2, a2, a1
-; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    sltiu a1, a2, 17
+; RV64-NEXT:    addi a2, a2, -16
+; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, sp, a2
@@ -1859,12 +1859,12 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV32-NEXT:  .LBB85_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
-; RV32-NEXT:    addi a1, a2, -16
+; RV32-NEXT:    sltiu a1, a2, 17
+; RV32-NEXT:    addi a2, a2, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a2, a2, a1
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    neg a1, a1
+; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1905,12 +1905,12 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v24, (a0), v16, v0.t
-; RV64-NEXT:    addi a1, a2, -16
-; RV64-NEXT:    sltu a2, a2, a1
-; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    sltiu a1, a2, 17
+; RV64-NEXT:    addi a2, a2, -16
+; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, sp, a2
@@ -1946,12 +1946,12 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV32-NEXT:  .LBB86_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
-; RV32-NEXT:    addi a1, a2, -16
+; RV32-NEXT:    sltiu a1, a2, 17
+; RV32-NEXT:    addi a2, a2, -16
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    sltu a2, a2, a1
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    neg a1, a1
+; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1992,12 +1992,12 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v24, (a0), v16, v0.t
-; RV64-NEXT:    addi a1, a2, -16
-; RV64-NEXT:    sltu a2, a2, a1
-; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    sltiu a1, a2, 17
+; RV64-NEXT:    addi a2, a2, -16
+; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
-; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, sp, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
index 855a87d21b7dc..b4e402caf5ba4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
@@ -255,12 +255,12 @@ define void @vpstore_v32f64(<32 x double> %val, ptr %ptr, <32 x i1> %m, i32 zero
 ; CHECK-NEXT:  .LBB24_2:
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a0), v0.t
-; CHECK-NEXT:    addi a2, a1, -16
+; CHECK-NEXT:    sltiu a2, a1, 17
+; CHECK-NEXT:    addi a1, a1, -16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    sltu a1, a1, a2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    neg a2, a2
+; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v16, (a0), v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
index acaa1e6fa002d..495049e51fb64 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
@@ -363,10 +363,10 @@ define <256 x i8> @vsadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    addi a0, a1, -128
-; CHECK-NEXT:    sltu a3, a1, a0
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a0, a3, a0
+; CHECK-NEXT:    sltiu a0, a1, 129
+; CHECK-NEXT:    addi a3, a1, -128
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v16, v16, -1, v0.t
 ; CHECK-NEXT:    bltu a1, a2, .LBB32_2
@@ -392,10 +392,10 @@ define <256 x i8> @vsadd_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:  .LBB33_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v8, v8, -1
-; CHECK-NEXT:    addi a1, a0, -128
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 129
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -128
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v16, v16, -1
 ; CHECK-NEXT:    ret
@@ -1335,10 +1335,10 @@ define <32 x i64> @vsadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:  .LBB108_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v8, v8, -1, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v16, v16, -1, v0.t
@@ -1358,10 +1358,10 @@ define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:  .LBB109_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v8, v8, -1
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vsadd.vi v16, v16, -1
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
index 9b3b8348d9b30..a5f57c24aaaaf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
@@ -359,10 +359,10 @@ define <256 x i8> @vsaddu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    addi a0, a1, -128
-; CHECK-NEXT:    sltu a3, a1, a0
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a0, a3, a0
+; CHECK-NEXT:    sltiu a0, a1, 129
+; CHECK-NEXT:    addi a3, a1, -128
+; CHECK-NEXT:    neg a0, a0
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v16, v16, -1, v0.t
 ; CHECK-NEXT:    bltu a1, a2, .LBB32_2
@@ -388,10 +388,10 @@ define <256 x i8> @vsaddu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:  .LBB33_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v8, v8, -1
-; CHECK-NEXT:    addi a1, a0, -128
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 129
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -128
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v16, v16, -1
 ; CHECK-NEXT:    ret
@@ -1331,10 +1331,10 @@ define <32 x i64> @vsaddu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:  .LBB108_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v8, v8, -1, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v16, v16, -1, v0.t
@@ -1354,10 +1354,10 @@ define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:  .LBB109_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v8, v8, -1
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vsaddu.vi v16, v16, -1
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
index f2f9f90f386c0..e91477a622b1f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
@@ -143,15 +143,15 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3
 ; CHECK-NEXT:    vmv1r.v v6, v8
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    li a2, 128
-; CHECK-NEXT:    addi a4, a1, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v24, (a0)
-; CHECK-NEXT:    addi a0, a3, -128
-; CHECK-NEXT:    vle8.v v8, (a4)
-; CHECK-NEXT:    sltu a4, a3, a0
+; CHECK-NEXT:    addi a0, a1, 128
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    sltiu a0, a3, 129
+; CHECK-NEXT:    neg a0, a0
 ; CHECK-NEXT:    vle8.v v16, (a1)
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a0, a4, a0
+; CHECK-NEXT:    addi a1, a3, -128
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
@@ -342,12 +342,12 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32
 ; CHECK-NEXT:  .LBB25_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    addi a0, a2, -16
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    sltiu a0, a2, 17
+; CHECK-NEXT:    addi a2, a2, -16
+; CHECK-NEXT:    neg a0, a0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -511,12 +511,12 @@ define <64 x float> @select_v64f32(<64 x i1> %a, <64 x float> %b, <64 x float> %
 ; CHECK-NEXT:  .LBB35_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    addi a0, a2, -32
-; CHECK-NEXT:    sltu a1, a2, a0
-; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    sltiu a0, a2, 33
+; CHECK-NEXT:    addi a2, a2, -32
+; CHECK-NEXT:    neg a0, a0
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 4
-; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
index 4c7d312e8e785..0947e39ce87e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
@@ -373,12 +373,12 @@ define <256 x i8> @vssub_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a2, 128
-; CHECK-NEXT:    addi a3, a1, -128
+; CHECK-NEXT:    sltiu a3, a1, 129
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    sltu a0, a1, a3
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a3, a0, a3
+; CHECK-NEXT:    addi a0, a1, -128
+; CHECK-NEXT:    neg a3, a3
+; CHECK-NEXT:    and a3, a3, a0
 ; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v16, v16, a0, v0.t
@@ -406,10 +406,10 @@ define <256 x i8> @vssub_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v8, v8, a2
-; CHECK-NEXT:    addi a1, a0, -128
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 129
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -128
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v16, v16, a2
 ; CHECK-NEXT:    ret
@@ -1376,10 +1376,10 @@ define <32 x i64> @vssub_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v16, v16, a2, v0.t
@@ -1400,10 +1400,10 @@ define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v8, v8, a2
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v16, v16, a2
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
index f9000a1b88a6d..12fef2f06bfcf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
@@ -368,12 +368,12 @@ define <256 x i8> @vssubu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a2, 128
-; CHECK-NEXT:    addi a3, a1, -128
+; CHECK-NEXT:    sltiu a3, a1, 129
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    sltu a0, a1, a3
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a3, a0, a3
+; CHECK-NEXT:    addi a0, a1, -128
+; CHECK-NEXT:    neg a3, a3
+; CHECK-NEXT:    and a3, a3, a0
 ; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v16, v16, a0, v0.t
@@ -401,10 +401,10 @@ define <256 x i8> @vssubu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v8, v8, a2
-; CHECK-NEXT:    addi a1, a0, -128
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 129
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -128
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v16, v16, a2
 ; CHECK-NEXT:    ret
@@ -1371,10 +1371,10 @@ define <32 x i64> @vssubu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v16, v16, a2, v0.t
@@ -1395,10 +1395,10 @@ define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a2, -1
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v8, v8, a2
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v16, v16, a2
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
index e2d9e0ac2deea..0bdbf1bb54074 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
@@ -147,10 +147,10 @@ define <32 x i64> @vzext_v32i64_v32i32(<32 x i32> %va, <32 x i1> %m, i32 zeroext
 ; CHECK-NEXT:  .LBB12_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v16, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 16
 ; CHECK-NEXT:    vmv1r.v v0, v24
@@ -174,10 +174,10 @@ define <32 x i64> @vzext_v32i64_v32i32_unmasked(<32 x i32> %va, i32 zeroext %evl
 ; CHECK-NEXT:  .LBB13_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v24, v8
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    sltiu a1, a0, 17
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 16
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
index e2deefa26ecb3..0ed12ddbb0f2c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v5, v6
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v6, v7
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v5, v6
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_floor_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v6, v7
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFH-NEXT:    sub a2, a0, a1
 ; RV32ZVFH-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT:    sltu a3, a0, a2
+; RV32ZVFH-NEXT:    sltu a3, a1, a0
 ; RV32ZVFH-NEXT:    addi a3, a3, -1
 ; RV32ZVFH-NEXT:    and a2, a3, a2
 ; RV32ZVFH-NEXT:    vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFHMIN-NEXT:    sub a2, a0, a1
 ; RV32ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT:    sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a3, a3, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a3, a2
 ; RV32ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFH-NEXT:    sub a3, a0, a1
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT:    sltu a2, a0, a3
+; RV32ZVFH-NEXT:    sltu a2, a1, a0
 ; RV32ZVFH-NEXT:    addi a2, a2, -1
 ; RV32ZVFH-NEXT:    and a2, a2, a3
 ; RV32ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index 0e0c92b150d33..33ae7ca7d7847 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -270,14 +270,14 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v5, v16, v16, v0.t
@@ -398,18 +398,18 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v6, v16, v16, v0.t
@@ -892,14 +892,14 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v5, v16, v16, v0.t
@@ -1031,18 +1031,18 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v6, v16, v16, v0.t
@@ -1418,7 +1418,7 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v8, (a3)
-; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    sltu a3, a1, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vmv1r.v v0, v6
@@ -1509,7 +1509,7 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v24, (a3)
-; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    sltu a3, a1, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index 86ed239e99373..173ea25335375 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -270,14 +270,14 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v5, v16, v16, v0.t
@@ -398,18 +398,18 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v6, v16, v16, v0.t
@@ -892,14 +892,14 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v5, v16, v16, v0.t
@@ -1031,18 +1031,18 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v6, v16, v16, v0.t
@@ -1418,7 +1418,7 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v8, (a3)
-; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    sltu a3, a1, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vmv1r.v v0, v6
@@ -1509,7 +1509,7 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v24, (a3)
-; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    sltu a3, a1, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
index 736dd1225da88..cbccc96f43cbe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
@@ -958,7 +958,7 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    sub a5, a4, a1
 ; CHECK-NEXT:    add a6, a2, a3
 ; CHECK-NEXT:    vl8re64.v v8, (a6)
-; CHECK-NEXT:    sltu a6, a4, a5
+; CHECK-NEXT:    sltu a6, a1, a4
 ; CHECK-NEXT:    addi a6, a6, -1
 ; CHECK-NEXT:    and a5, a6, a5
 ; CHECK-NEXT:    srli a6, a1, 3
@@ -1059,7 +1059,7 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    add a3, a2, a5
 ; CHECK-NEXT:    vl8re64.v v8, (a3)
-; CHECK-NEXT:    sltu a3, a4, a6
+; CHECK-NEXT:    sltu a3, a1, a4
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a6, a3, a6
 ; CHECK-NEXT:    li a3, 63
diff --git a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
index c0a794afac3ae..c9478d65058f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
@@ -57,7 +57,7 @@ define <vscale x 16 x i64> @llrint_nxv16i64_nxv16f32(<vscale x 16 x float> %x, <
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
index c09df1a60d2ae..4136bab37bc9b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
@@ -119,7 +119,7 @@ define <vscale x 16 x iXLen> @lrint_nxv16f32(<vscale x 16 x float> %x, <vscale x
 ; RV64-i64-NEXT:    srli a2, a1, 3
 ; RV64-i64-NEXT:    sub a3, a0, a1
 ; RV64-i64-NEXT:    vslidedown.vx v0, v0, a2
-; RV64-i64-NEXT:    sltu a2, a0, a3
+; RV64-i64-NEXT:    sltu a2, a1, a0
 ; RV64-i64-NEXT:    addi a2, a2, -1
 ; RV64-i64-NEXT:    and a2, a2, a3
 ; RV64-i64-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
index 67e7f7c7fbd42..236ba9096f4f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16(<vscale x 32 x bfloat> %va
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v5, v6
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16_unmasked(<vscale x 32 x bf
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v6, v7
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v5, v6
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v6, v7
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFH-NEXT:    sub a2, a0, a1
 ; RV32ZVFH-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT:    sltu a3, a0, a2
+; RV32ZVFH-NEXT:    sltu a3, a1, a0
 ; RV32ZVFH-NEXT:    addi a3, a3, -1
 ; RV32ZVFH-NEXT:    and a2, a3, a2
 ; RV32ZVFH-NEXT:    vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFHMIN-NEXT:    sub a2, a0, a1
 ; RV32ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT:    sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a3, a3, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a3, a2
 ; RV32ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFH-NEXT:    sub a3, a0, a1
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT:    sltu a2, a0, a3
+; RV32ZVFH-NEXT:    sltu a2, a1, a0
 ; RV32ZVFH-NEXT:    addi a2, a2, -1
 ; RV32ZVFH-NEXT:    and a2, a2, a3
 ; RV32ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll b/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll
index 1ee7e138654b9..3e9c669106a26 100644
--- a/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll
@@ -24274,7 +24274,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_P1(<vscale x 16 x
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24302,7 +24302,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_P1(<vscale x 16 x
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24335,7 +24335,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_PALL(<vscale x 16
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24363,7 +24363,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_PALL(<vscale x 16
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24396,7 +24396,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_S1(<vscale x 16 x
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24424,7 +24424,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_S1(<vscale x 16 x
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24457,7 +24457,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_ALL(<vscale x 16 x
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24485,7 +24485,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_ALL(<vscale x 16 x
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24517,7 +24517,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_DEFAULT(<vscale x
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24545,7 +24545,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_DEFAULT(<vscale x
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
@@ -24586,10 +24586,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_P1(<vscale x 16 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24614,10 +24614,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_P1(<vscale x 16 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24647,10 +24647,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_PALL(<vscale x 16 x i8> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24675,10 +24675,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_PALL(<vscale x 16 x i8> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24708,10 +24708,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_S1(<vscale x 16 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24736,10 +24736,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_S1(<vscale x 16 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24769,10 +24769,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_ALL(<vscale x 16 x i8> %val, <v
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24797,10 +24797,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_ALL(<vscale x 16 x i8> %val, <v
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24829,10 +24829,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_DEFAULT(<vscale x 16 x i8> %val
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v9, (zero), v24
@@ -24857,10 +24857,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_DEFAULT(<vscale x 16 x i8> %val
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v9, (zero), v24
@@ -25538,7 +25538,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_P1(<vscale x 16
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25566,7 +25566,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_P1(<vscale x 16
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25599,7 +25599,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_PALL(<vscale x 1
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25627,7 +25627,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_PALL(<vscale x 1
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25660,7 +25660,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_S1(<vscale x 16
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25688,7 +25688,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_S1(<vscale x 16
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25721,7 +25721,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_ALL(<vscale x 16
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25749,7 +25749,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_ALL(<vscale x 16
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25781,7 +25781,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_DEFAULT(<vscale
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25809,7 +25809,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_DEFAULT(<vscale
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
@@ -25850,10 +25850,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_P1(<vscale x 16 x i16> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v24
@@ -25878,10 +25878,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_P1(<vscale x 16 x i16> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v24
@@ -25911,10 +25911,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_PALL(<vscale x 16 x i16> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v24
@@ -25939,10 +25939,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_PALL(<vscale x 16 x i16> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v24
@@ -25972,10 +25972,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_S1(<vscale x 16 x i16> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v24
@@ -26000,10 +26000,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_S1(<vscale x 16 x i16> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v24
@@ -26033,10 +26033,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_ALL(<vscale x 16 x i16> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v24
@@ -26061,10 +26061,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_ALL(<vscale x 16 x i16> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v24
@@ -26093,10 +26093,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_DEFAULT(<vscale x 16 x i16> %v
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v24
@@ -26121,10 +26121,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_DEFAULT(<vscale x 16 x i16> %v
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v24
@@ -26802,7 +26802,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_P1(<vscale x 16
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -26829,7 +26829,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_P1(<vscale x 16
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -26861,7 +26861,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_PALL(<vscale x 1
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -26888,7 +26888,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_PALL(<vscale x 1
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -26920,7 +26920,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_S1(<vscale x 16
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -26947,7 +26947,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_S1(<vscale x 16
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -26979,7 +26979,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_ALL(<vscale x 16
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -27006,7 +27006,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_ALL(<vscale x 16
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -27037,7 +27037,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_DEFAULT(<vscale
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -27064,7 +27064,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_DEFAULT(<vscale
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -27104,10 +27104,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_P1(<vscale x 16 x i32> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27132,10 +27132,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_P1(<vscale x 16 x i32> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27165,10 +27165,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_PALL(<vscale x 16 x i32> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27193,10 +27193,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_PALL(<vscale x 16 x i32> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27226,10 +27226,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_S1(<vscale x 16 x i32> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27254,10 +27254,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_S1(<vscale x 16 x i32> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27287,10 +27287,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_ALL(<vscale x 16 x i32> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27315,10 +27315,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_ALL(<vscale x 16 x i32> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27347,10 +27347,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_DEFAULT(<vscale x 16 x i32> %v
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -27375,10 +27375,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_DEFAULT(<vscale x 16 x i32> %v
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28056,7 +28056,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_P1(<vscale x 1
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28083,7 +28083,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_P1(<vscale x 1
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28115,7 +28115,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_PALL(<vscale x
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28142,7 +28142,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_PALL(<vscale x
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28174,7 +28174,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_S1(<vscale x 1
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28201,7 +28201,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_S1(<vscale x 1
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28233,7 +28233,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_ALL(<vscale x
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28260,7 +28260,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_ALL(<vscale x
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28291,7 +28291,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_DEFAULT(<vscal
 ; CHECK-RV64V:       # %bb.0:
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
 ; CHECK-RV64V-NEXT:    sub a2, a0, a1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28318,7 +28318,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_DEFAULT(<vscal
 ; CHECK-RV64VC:       # %bb.0:
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV64VC-NEXT:    sub a2, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -28358,10 +28358,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_P1(<vscale x 16 x float> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28386,10 +28386,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_P1(<vscale x 16 x float> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28419,10 +28419,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_PALL(<vscale x 16 x float> %va
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28447,10 +28447,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_PALL(<vscale x 16 x float> %va
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28480,10 +28480,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_S1(<vscale x 16 x float> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28508,10 +28508,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_S1(<vscale x 16 x float> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28541,10 +28541,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_ALL(<vscale x 16 x float> %val
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28569,10 +28569,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_ALL(<vscale x 16 x float> %val
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28601,10 +28601,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_DEFAULT(<vscale x 16 x float>
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT:    sub a0, a1, a0
-; CHECK-RV64V-NEXT:    sltu a1, a1, a0
-; CHECK-RV64V-NEXT:    addi a1, a1, -1
-; CHECK-RV64V-NEXT:    and a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a2, a1, a0
+; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
@@ -28629,10 +28629,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_DEFAULT(<vscale x 16 x float>
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT:    sub a0, a1, a0
-; CHECK-RV64VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV64VC-NEXT:    addi a1, a1, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a1, a0
+; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
@@ -29322,12 +29322,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB850_2
@@ -29345,7 +29345,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB850_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29374,7 +29374,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -29406,12 +29406,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB850_2
@@ -29429,7 +29429,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB850_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29458,7 +29458,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -29495,12 +29495,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB851_2
@@ -29518,7 +29518,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB851_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29547,7 +29547,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -29579,12 +29579,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB851_2
@@ -29602,7 +29602,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB851_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29631,7 +29631,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -29668,12 +29668,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB852_2
@@ -29691,7 +29691,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB852_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29720,7 +29720,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -29752,12 +29752,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB852_2
@@ -29775,7 +29775,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB852_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29804,7 +29804,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -29841,12 +29841,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB853_2
@@ -29864,7 +29864,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB853_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29893,7 +29893,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -29925,12 +29925,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB853_2
@@ -29948,7 +29948,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB853_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -29977,7 +29977,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -30013,12 +30013,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB854_2
@@ -30036,7 +30036,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB854_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30065,7 +30065,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -30097,12 +30097,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB854_2
@@ -30120,7 +30120,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB854_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30149,7 +30149,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
@@ -30226,13 +30226,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v17, (zero), v0
@@ -30244,10 +30244,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30277,10 +30277,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v24
@@ -30342,13 +30342,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v17, (zero), v0
@@ -30360,10 +30360,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30393,10 +30393,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v24
@@ -30463,13 +30463,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v17, (zero), v0
@@ -30481,10 +30481,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30514,10 +30514,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v24
@@ -30579,13 +30579,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v17, (zero), v0
@@ -30597,10 +30597,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30630,10 +30630,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v24
@@ -30700,13 +30700,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v17, (zero), v0
@@ -30718,10 +30718,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30751,10 +30751,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v24
@@ -30816,13 +30816,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v17, (zero), v0
@@ -30834,10 +30834,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30867,10 +30867,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v24
@@ -30937,13 +30937,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v17, (zero), v0
@@ -30955,10 +30955,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -30988,10 +30988,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v24
@@ -31053,13 +31053,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v17, (zero), v0
@@ -31071,10 +31071,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -31104,10 +31104,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v24
@@ -31173,13 +31173,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v17, (zero), v0
@@ -31191,10 +31191,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -31224,10 +31224,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v24
@@ -31289,13 +31289,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v16, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v17, (zero), v0
@@ -31307,10 +31307,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
@@ -31340,10 +31340,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v24
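
The gather tests that follow show the same change inside a longer pattern: the overall element count is split across several register groups, where each step takes min(evl, chunk) for the current group (the bltu/mv pairs) and the clamped remainder for the next one (the sub/sltu/addi/and runs). A rough C++ sketch of that shape, under the assumption that `chunk` stands in for the per-group VL and `groups` for the number of splits (both names are illustrative, not taken from the tests):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Split an element count into per-group lengths: min() for the current
    // group, branchless clamped subtraction for the remainder.
    std::vector<uint64_t> split_evl(uint64_t evl, uint64_t chunk, unsigned groups) {
      std::vector<uint64_t> lens;
      for (unsigned i = 0; i < groups; ++i) {
        lens.push_back(std::min(evl, chunk));              // bltu/mv clamp
        uint64_t diff = evl - chunk;                       // sub (may wrap)
        uint64_t mask = (evl < chunk) ? 0 : ~uint64_t{0};  // overflow flag -> mask
        evl = diff & mask;                                 // remainder, clamped at 0
      }
      return lens;
    }
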
@@ -32026,12 +32026,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB880_2
@@ -32049,7 +32049,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB880_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32078,7 +32078,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32110,12 +32110,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB880_2
@@ -32133,7 +32133,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB880_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32162,7 +32162,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32199,12 +32199,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB881_2
@@ -32222,7 +32222,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB881_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32251,7 +32251,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32283,12 +32283,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB881_2
@@ -32306,7 +32306,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB881_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32335,7 +32335,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32372,12 +32372,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB882_2
@@ -32395,7 +32395,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB882_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32424,7 +32424,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32456,12 +32456,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB882_2
@@ -32479,7 +32479,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB882_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32508,7 +32508,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32545,12 +32545,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB883_2
@@ -32568,7 +32568,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB883_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32597,7 +32597,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32629,12 +32629,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB883_2
@@ -32652,7 +32652,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB883_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32681,7 +32681,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32717,12 +32717,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
 ; CHECK-RV64V-NEXT:    slli a3, a1, 1
 ; CHECK-RV64V-NEXT:    add a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a2, a3
+; CHECK-RV64V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64V-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT:    sltu a0, a2, a4
-; CHECK-RV64V-NEXT:    addi a0, a0, -1
+; CHECK-RV64V-NEXT:    addi a0, a5, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a4
 ; CHECK-RV64V-NEXT:    sub a4, a0, a1
-; CHECK-RV64V-NEXT:    sltu a5, a0, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
 ; CHECK-RV64V-NEXT:    and a4, a5, a4
 ; CHECK-RV64V-NEXT:    bltu a0, a1, .LBB884_2
@@ -32740,7 +32740,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
 ; CHECK-RV64V-NEXT:    mv a2, a3
 ; CHECK-RV64V-NEXT:  .LBB884_4:
 ; CHECK-RV64V-NEXT:    sub a0, a2, a1
-; CHECK-RV64V-NEXT:    sltu a3, a2, a0
+; CHECK-RV64V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a0, a3, a0
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32769,7 +32769,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
 ; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    slli a1, a1, 1
 ; CHECK-RV32V-NEXT:    sub a2, a0, a1
-; CHECK-RV32V-NEXT:    sltu a3, a0, a2
+; CHECK-RV32V-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
 ; CHECK-RV32V-NEXT:    and a2, a3, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32801,12 +32801,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
 ; CHECK-RV64VC-NEXT:    slli a3, a1, 1
 ; CHECK-RV64VC-NEXT:    add a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a2, a3
+; CHECK-RV64VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT:    sltu a0, a2, a4
-; CHECK-RV64VC-NEXT:    addi a0, a0, -1
+; CHECK-RV64VC-NEXT:    addi a0, a5, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a4
 ; CHECK-RV64VC-NEXT:    sub a4, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a5, a0, a4
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a5
 ; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB884_2
@@ -32824,7 +32824,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
 ; CHECK-RV64VC-NEXT:    mv a2, a3
 ; CHECK-RV64VC-NEXT:  .LBB884_4:
 ; CHECK-RV64VC-NEXT:    sub a0, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a3, a2, a0
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a3
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32853,7 +32853,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
 ; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    slli a1, a1, 1
 ; CHECK-RV32VC-NEXT:    sub a2, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV32VC-NEXT:    and a2, a2, a3
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -32907,13 +32907,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v0
@@ -32925,10 +32925,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -32955,10 +32955,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
@@ -32997,13 +32997,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33015,10 +33015,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33045,10 +33045,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33092,13 +33092,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33110,10 +33110,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33140,10 +33140,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33182,13 +33182,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33200,10 +33200,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33230,10 +33230,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33277,13 +33277,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33295,10 +33295,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33325,10 +33325,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33367,13 +33367,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33385,10 +33385,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33415,10 +33415,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33462,13 +33462,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33480,10 +33480,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33510,10 +33510,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33552,13 +33552,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33570,10 +33570,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33600,10 +33600,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33646,13 +33646,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, a1
-; CHECK-RV64V-NEXT:    sub a2, a3, a2
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a2
+; CHECK-RV64V-NEXT:    sltu a4, a1, a4
+; CHECK-RV64V-NEXT:    sub a5, a3, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
-; CHECK-RV64V-NEXT:    and a0, a3, a2
+; CHECK-RV64V-NEXT:    and a0, a2, a5
 ; CHECK-RV64V-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33664,10 +33664,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
 ; CHECK-RV64V-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a2, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a0, a0, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a2
 ; CHECK-RV64V-NEXT:    addi a1, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33694,10 +33694,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT:    sub a0, a1, a0
-; CHECK-RV32V-NEXT:    sltu a1, a1, a0
-; CHECK-RV32V-NEXT:    addi a1, a1, -1
-; CHECK-RV32V-NEXT:    and a0, a1, a0
+; CHECK-RV32V-NEXT:    sub a2, a1, a0
+; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    addi a0, a0, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
@@ -33736,13 +33736,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v8, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, a1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a2
-; CHECK-RV64VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV64VC-NEXT:    sub a5, a3, a2
+; CHECK-RV64VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    addi a3, a3, -1
+; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a4, a4, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a2
+; CHECK-RV64VC-NEXT:    and a0, a2, a5
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v10, (zero), v0
@@ -33754,10 +33754,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e16, m2, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    sub a1, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sub a2, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
-; CHECK-RV64VC-NEXT:    and a0, a0, a1
+; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
@@ -33784,10 +33784,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT:    sub a0, a1, a0
-; CHECK-RV32VC-NEXT:    sltu a1, a1, a0
-; CHECK-RV32VC-NEXT:    addi a1, a1, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a1, a0
+; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    addi a0, a0, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
@@ -34527,30 +34527,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    mv a0, s1
 ; CHECK-RV64V-NEXT:    call __muldi3
 ; CHECK-RV64V-NEXT:    slli a2, s1, 2
-; CHECK-RV64V-NEXT:    sub a1, s0, a2
-; CHECK-RV64V-NEXT:    sltu a3, s0, a1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
-; CHECK-RV64V-NEXT:    and a3, a3, a1
 ; CHECK-RV64V-NEXT:    slli a1, s1, 1
+; CHECK-RV64V-NEXT:    sub a3, s0, a2
+; CHECK-RV64V-NEXT:    sltu a4, a2, s0
+; CHECK-RV64V-NEXT:    addi a4, a4, -1
+; CHECK-RV64V-NEXT:    and a3, a4, a3
 ; CHECK-RV64V-NEXT:    sub a4, a3, a1
-; CHECK-RV64V-NEXT:    sltu a5, a3, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a3
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
-; CHECK-RV64V-NEXT:    and a6, a5, a4
-; CHECK-RV64V-NEXT:    sub a4, a6, s1
-; CHECK-RV64V-NEXT:    mv a5, a6
-; CHECK-RV64V-NEXT:    bltu a6, s1, .LBB910_2
+; CHECK-RV64V-NEXT:    and a5, a5, a4
+; CHECK-RV64V-NEXT:    mv a4, a5
+; CHECK-RV64V-NEXT:    bltu a5, s1, .LBB910_2
 ; CHECK-RV64V-NEXT:  # %bb.1:
-; CHECK-RV64V-NEXT:    mv a5, s1
+; CHECK-RV64V-NEXT:    mv a4, s1
 ; CHECK-RV64V-NEXT:  .LBB910_2:
-; CHECK-RV64V-NEXT:    sltu a7, a6, a4
+; CHECK-RV64V-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64V-NEXT:    bltu a3, a1, .LBB910_4
 ; CHECK-RV64V-NEXT:  # %bb.3:
 ; CHECK-RV64V-NEXT:    mv a3, a1
 ; CHECK-RV64V-NEXT:  .LBB910_4:
 ; CHECK-RV64V-NEXT:    add a6, s2, a0
-; CHECK-RV64V-NEXT:    addi a0, a7, -1
+; CHECK-RV64V-NEXT:    sub a0, a5, s1
+; CHECK-RV64V-NEXT:    addi a5, a7, -1
 ; CHECK-RV64V-NEXT:    sub a7, a3, s1
-; CHECK-RV64V-NEXT:    sltu t0, a3, a7
+; CHECK-RV64V-NEXT:    sltu t0, s1, a3
 ; CHECK-RV64V-NEXT:    addi t0, t0, -1
 ; CHECK-RV64V-NEXT:    and a7, t0, a7
 ; CHECK-RV64V-NEXT:    bltu a3, s1, .LBB910_6
@@ -34560,26 +34560,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (a6)
 ; CHECK-RV64V-NEXT:    addi a6, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 3
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 3
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a7, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 4
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 4
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    and a0, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a5, a0
 ; CHECK-RV64V-NEXT:    bltu s0, a2, .LBB910_8
 ; CHECK-RV64V-NEXT:  # %bb.7:
 ; CHECK-RV64V-NEXT:    mv s0, a2
@@ -34589,11 +34589,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64V-NEXT:    sub a0, s0, a1
-; CHECK-RV64V-NEXT:    sltu a2, s0, a0
+; CHECK-RV64V-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a0, a2, a0
 ; CHECK-RV64V-NEXT:    sub a2, a0, s1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    bltu a0, s1, .LBB910_10
@@ -34619,7 +34619,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    mv s0, a1
 ; CHECK-RV64V-NEXT:  .LBB910_12:
 ; CHECK-RV64V-NEXT:    sub a0, s0, s1
-; CHECK-RV64V-NEXT:    sltu a1, s0, a0
+; CHECK-RV64V-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64V-NEXT:    addi a1, a1, -1
 ; CHECK-RV64V-NEXT:    and a0, a1, a0
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -34674,45 +34674,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV32V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT:    csrr a4, vlenb
+; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT:    slli a3, a4, 3
-; CHECK-RV32V-NEXT:    slli a1, a4, 2
-; CHECK-RV32V-NEXT:    add a0, a0, a3
-; CHECK-RV32V-NEXT:    sub a3, a2, a1
+; CHECK-RV32V-NEXT:    slli a4, a1, 3
+; CHECK-RV32V-NEXT:    slli a3, a1, 2
+; CHECK-RV32V-NEXT:    slli a1, a1, 1
+; CHECK-RV32V-NEXT:    add a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a2, a3
+; CHECK-RV32V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32V-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT:    sltu a0, a2, a3
-; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a3, a0, a3
-; CHECK-RV32V-NEXT:    slli a0, a4, 1
-; CHECK-RV32V-NEXT:    sub a4, a3, a0
-; CHECK-RV32V-NEXT:    sltu a5, a3, a4
+; CHECK-RV32V-NEXT:    addi a0, a5, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a0, a1
+; CHECK-RV32V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32V-NEXT:    addi a5, a5, -1
 ; CHECK-RV32V-NEXT:    and a4, a5, a4
-; CHECK-RV32V-NEXT:    bltu a3, a0, .LBB910_2
+; CHECK-RV32V-NEXT:    bltu a0, a1, .LBB910_2
 ; CHECK-RV32V-NEXT:  # %bb.1:
-; CHECK-RV32V-NEXT:    mv a3, a0
+; CHECK-RV32V-NEXT:    mv a0, a1
 ; CHECK-RV32V-NEXT:  .LBB910_2:
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB910_4
+; CHECK-RV32V-NEXT:    bltu a2, a3, .LBB910_4
 ; CHECK-RV32V-NEXT:  # %bb.3:
-; CHECK-RV32V-NEXT:    mv a2, a1
+; CHECK-RV32V-NEXT:    mv a2, a3
 ; CHECK-RV32V-NEXT:  .LBB910_4:
-; CHECK-RV32V-NEXT:    sub a1, a2, a0
-; CHECK-RV32V-NEXT:    sltu a3, a2, a1
+; CHECK-RV32V-NEXT:    sub a0, a2, a1
+; CHECK-RV32V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
-; CHECK-RV32V-NEXT:    and a1, a3, a1
-; CHECK-RV32V-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    and a0, a3, a0
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT:    bltu a2, a0, .LBB910_6
+; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB910_6
 ; CHECK-RV32V-NEXT:  # %bb.5:
-; CHECK-RV32V-NEXT:    mv a2, a0
+; CHECK-RV32V-NEXT:    mv a2, a1
 ; CHECK-RV32V-NEXT:  .LBB910_6:
 ; CHECK-RV32V-NEXT:    addi a0, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -34800,33 +34800,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    li a1, 40
 ; CHECK-RV64VC-NEXT:    mv a0, s1
 ; CHECK-RV64VC-NEXT:    call __muldi3
-; CHECK-RV64VC-NEXT:    slli a7, s1, 2
-; CHECK-RV64VC-NEXT:    sub a1, s0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a1
-; CHECK-RV64VC-NEXT:    addi a2, a2, -1
-; CHECK-RV64VC-NEXT:    and a3, a2, a1
+; CHECK-RV64VC-NEXT:    slli a2, s1, 2
 ; CHECK-RV64VC-NEXT:    slli a1, s1, 1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a3, s0, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a2, s0
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
-; CHECK-RV64VC-NEXT:    sub t0, a2, s1
-; CHECK-RV64VC-NEXT:    mv a5, a2
-; CHECK-RV64VC-NEXT:    bltu a2, s1, .LBB910_2
+; CHECK-RV64VC-NEXT:    and a3, a3, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a3
+; CHECK-RV64VC-NEXT:    addi a5, a5, -1
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
+; CHECK-RV64VC-NEXT:    mv t0, a5
+; CHECK-RV64VC-NEXT:    bltu a5, s1, .LBB910_2
 ; CHECK-RV64VC-NEXT:  # %bb.1:
-; CHECK-RV64VC-NEXT:    mv a5, s1
+; CHECK-RV64VC-NEXT:    mv t0, s1
 ; CHECK-RV64VC-NEXT:  .LBB910_2:
-; CHECK-RV64VC-NEXT:    sltu a6, a2, t0
+; CHECK-RV64VC-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB910_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
 ; CHECK-RV64VC-NEXT:    mv a3, a1
 ; CHECK-RV64VC-NEXT:  .LBB910_4:
 ; CHECK-RV64VC-NEXT:    add a0, a0, s2
-; CHECK-RV64VC-NEXT:    addi a6, a6, -1
-; CHECK-RV64VC-NEXT:    sub a2, a3, s1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a6, a5, s1
+; CHECK-RV64VC-NEXT:    addi a7, a7, -1
+; CHECK-RV64VC-NEXT:    sub a5, a3, s1
+; CHECK-RV64VC-NEXT:    sltu a4, s1, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
 ; CHECK-RV64VC-NEXT:    bltu a3, s1, .LBB910_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a3, s1
@@ -34834,7 +34834,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-RV64VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, t0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v14, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -34842,7 +34842,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    add a0, a0, sp
 ; CHECK-RV64VC-NEXT:    addi a0, a0, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v13, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -34853,21 +34853,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    and a0, a6, t0
-; CHECK-RV64VC-NEXT:    bltu s0, a7, .LBB910_8
+; CHECK-RV64VC-NEXT:    and a0, a7, a6
+; CHECK-RV64VC-NEXT:    bltu s0, a2, .LBB910_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv s0, a7
+; CHECK-RV64VC-NEXT:    mv s0, a2
 ; CHECK-RV64VC-NEXT:  .LBB910_8:
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64VC-NEXT:    sub a0, s0, a1
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    sub a2, a0, s1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    bltu a0, s1, .LBB910_10
@@ -34893,7 +34893,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    mv s0, a1
 ; CHECK-RV64VC-NEXT:  .LBB910_12:
 ; CHECK-RV64VC-NEXT:    sub a0, s0, s1
-; CHECK-RV64VC-NEXT:    sltu a1, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64VC-NEXT:    addi a1, a1, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -34948,45 +34948,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
 ; CHECK-RV32VC-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT:    csrr a4, vlenb
+; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT:    slli a3, a4, 3
-; CHECK-RV32VC-NEXT:    slli a1, a4, 2
-; CHECK-RV32VC-NEXT:    add a0, a0, a3
-; CHECK-RV32VC-NEXT:    sub a3, a2, a1
+; CHECK-RV32VC-NEXT:    slli a4, a1, 3
+; CHECK-RV32VC-NEXT:    slli a3, a1, 2
+; CHECK-RV32VC-NEXT:    slli a1, a1, 1
+; CHECK-RV32VC-NEXT:    add a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a2, a3
+; CHECK-RV32VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32VC-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT:    sltu a0, a2, a3
-; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a3, a3, a0
-; CHECK-RV32VC-NEXT:    slli a0, a4, 1
-; CHECK-RV32VC-NEXT:    sub a4, a3, a0
-; CHECK-RV32VC-NEXT:    sltu a5, a3, a4
+; CHECK-RV32VC-NEXT:    addi a0, a5, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a5
-; CHECK-RV32VC-NEXT:    bltu a3, a0, .LBB910_2
+; CHECK-RV32VC-NEXT:    bltu a0, a1, .LBB910_2
 ; CHECK-RV32VC-NEXT:  # %bb.1:
-; CHECK-RV32VC-NEXT:    mv a3, a0
+; CHECK-RV32VC-NEXT:    mv a0, a1
 ; CHECK-RV32VC-NEXT:  .LBB910_2:
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB910_4
+; CHECK-RV32VC-NEXT:    bltu a2, a3, .LBB910_4
 ; CHECK-RV32VC-NEXT:  # %bb.3:
-; CHECK-RV32VC-NEXT:    mv a2, a1
+; CHECK-RV32VC-NEXT:    mv a2, a3
 ; CHECK-RV32VC-NEXT:  .LBB910_4:
-; CHECK-RV32VC-NEXT:    sub a1, a2, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a2, a1
+; CHECK-RV32VC-NEXT:    sub a0, a2, a1
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
-; CHECK-RV32VC-NEXT:    and a1, a1, a3
-; CHECK-RV32VC-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    and a0, a0, a3
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT:    bltu a2, a0, .LBB910_6
+; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB910_6
 ; CHECK-RV32VC-NEXT:  # %bb.5:
-; CHECK-RV32VC-NEXT:    mv a2, a0
+; CHECK-RV32VC-NEXT:    mv a2, a1
 ; CHECK-RV32VC-NEXT:  .LBB910_6:
 ; CHECK-RV32VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35080,30 +35080,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64V-NEXT:    mv a0, s1
 ; CHECK-RV64V-NEXT:    call __muldi3
 ; CHECK-RV64V-NEXT:    slli a2, s1, 2
-; CHECK-RV64V-NEXT:    sub a1, s0, a2
-; CHECK-RV64V-NEXT:    sltu a3, s0, a1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
-; CHECK-RV64V-NEXT:    and a3, a3, a1
 ; CHECK-RV64V-NEXT:    slli a1, s1, 1
+; CHECK-RV64V-NEXT:    sub a3, s0, a2
+; CHECK-RV64V-NEXT:    sltu a4, a2, s0
+; CHECK-RV64V-NEXT:    addi a4, a4, -1
+; CHECK-RV64V-NEXT:    and a3, a4, a3
 ; CHECK-RV64V-NEXT:    sub a4, a3, a1
-; CHECK-RV64V-NEXT:    sltu a5, a3, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a3
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
-; CHECK-RV64V-NEXT:    and a6, a5, a4
-; CHECK-RV64V-NEXT:    sub a4, a6, s1
-; CHECK-RV64V-NEXT:    mv a5, a6
-; CHECK-RV64V-NEXT:    bltu a6, s1, .LBB911_2
+; CHECK-RV64V-NEXT:    and a5, a5, a4
+; CHECK-RV64V-NEXT:    mv a4, a5
+; CHECK-RV64V-NEXT:    bltu a5, s1, .LBB911_2
 ; CHECK-RV64V-NEXT:  # %bb.1:
-; CHECK-RV64V-NEXT:    mv a5, s1
+; CHECK-RV64V-NEXT:    mv a4, s1
 ; CHECK-RV64V-NEXT:  .LBB911_2:
-; CHECK-RV64V-NEXT:    sltu a7, a6, a4
+; CHECK-RV64V-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64V-NEXT:    bltu a3, a1, .LBB911_4
 ; CHECK-RV64V-NEXT:  # %bb.3:
 ; CHECK-RV64V-NEXT:    mv a3, a1
 ; CHECK-RV64V-NEXT:  .LBB911_4:
 ; CHECK-RV64V-NEXT:    add a6, s2, a0
-; CHECK-RV64V-NEXT:    addi a0, a7, -1
+; CHECK-RV64V-NEXT:    sub a0, a5, s1
+; CHECK-RV64V-NEXT:    addi a5, a7, -1
 ; CHECK-RV64V-NEXT:    sub a7, a3, s1
-; CHECK-RV64V-NEXT:    sltu t0, a3, a7
+; CHECK-RV64V-NEXT:    sltu t0, s1, a3
 ; CHECK-RV64V-NEXT:    addi t0, t0, -1
 ; CHECK-RV64V-NEXT:    and a7, t0, a7
 ; CHECK-RV64V-NEXT:    bltu a3, s1, .LBB911_6
@@ -35113,26 +35113,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (a6)
 ; CHECK-RV64V-NEXT:    addi a6, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 3
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 3
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a7, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 4
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 4
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    and a0, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a5, a0
 ; CHECK-RV64V-NEXT:    bltu s0, a2, .LBB911_8
 ; CHECK-RV64V-NEXT:  # %bb.7:
 ; CHECK-RV64V-NEXT:    mv s0, a2
@@ -35142,11 +35142,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64V-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64V-NEXT:    sub a0, s0, a1
-; CHECK-RV64V-NEXT:    sltu a2, s0, a0
+; CHECK-RV64V-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a0, a2, a0
 ; CHECK-RV64V-NEXT:    sub a2, a0, s1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    bltu a0, s1, .LBB911_10
@@ -35172,7 +35172,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64V-NEXT:    mv s0, a1
 ; CHECK-RV64V-NEXT:  .LBB911_12:
 ; CHECK-RV64V-NEXT:    sub a0, s0, s1
-; CHECK-RV64V-NEXT:    sltu a1, s0, a0
+; CHECK-RV64V-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64V-NEXT:    addi a1, a1, -1
 ; CHECK-RV64V-NEXT:    and a0, a1, a0
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -35227,45 +35227,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV32V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT:    csrr a4, vlenb
+; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT:    slli a3, a4, 3
-; CHECK-RV32V-NEXT:    slli a1, a4, 2
-; CHECK-RV32V-NEXT:    add a0, a0, a3
-; CHECK-RV32V-NEXT:    sub a3, a2, a1
+; CHECK-RV32V-NEXT:    slli a4, a1, 3
+; CHECK-RV32V-NEXT:    slli a3, a1, 2
+; CHECK-RV32V-NEXT:    slli a1, a1, 1
+; CHECK-RV32V-NEXT:    add a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a2, a3
+; CHECK-RV32V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32V-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT:    sltu a0, a2, a3
-; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a3, a0, a3
-; CHECK-RV32V-NEXT:    slli a0, a4, 1
-; CHECK-RV32V-NEXT:    sub a4, a3, a0
-; CHECK-RV32V-NEXT:    sltu a5, a3, a4
+; CHECK-RV32V-NEXT:    addi a0, a5, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a0, a1
+; CHECK-RV32V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32V-NEXT:    addi a5, a5, -1
 ; CHECK-RV32V-NEXT:    and a4, a5, a4
-; CHECK-RV32V-NEXT:    bltu a3, a0, .LBB911_2
+; CHECK-RV32V-NEXT:    bltu a0, a1, .LBB911_2
 ; CHECK-RV32V-NEXT:  # %bb.1:
-; CHECK-RV32V-NEXT:    mv a3, a0
+; CHECK-RV32V-NEXT:    mv a0, a1
 ; CHECK-RV32V-NEXT:  .LBB911_2:
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB911_4
+; CHECK-RV32V-NEXT:    bltu a2, a3, .LBB911_4
 ; CHECK-RV32V-NEXT:  # %bb.3:
-; CHECK-RV32V-NEXT:    mv a2, a1
+; CHECK-RV32V-NEXT:    mv a2, a3
 ; CHECK-RV32V-NEXT:  .LBB911_4:
-; CHECK-RV32V-NEXT:    sub a1, a2, a0
-; CHECK-RV32V-NEXT:    sltu a3, a2, a1
+; CHECK-RV32V-NEXT:    sub a0, a2, a1
+; CHECK-RV32V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
-; CHECK-RV32V-NEXT:    and a1, a3, a1
-; CHECK-RV32V-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    and a0, a3, a0
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT:    bltu a2, a0, .LBB911_6
+; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB911_6
 ; CHECK-RV32V-NEXT:  # %bb.5:
-; CHECK-RV32V-NEXT:    mv a2, a0
+; CHECK-RV32V-NEXT:    mv a2, a1
 ; CHECK-RV32V-NEXT:  .LBB911_6:
 ; CHECK-RV32V-NEXT:    addi a0, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35353,33 +35353,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64VC-NEXT:    li a1, 40
 ; CHECK-RV64VC-NEXT:    mv a0, s1
 ; CHECK-RV64VC-NEXT:    call __muldi3
-; CHECK-RV64VC-NEXT:    slli a7, s1, 2
-; CHECK-RV64VC-NEXT:    sub a1, s0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a1
-; CHECK-RV64VC-NEXT:    addi a2, a2, -1
-; CHECK-RV64VC-NEXT:    and a3, a2, a1
+; CHECK-RV64VC-NEXT:    slli a2, s1, 2
 ; CHECK-RV64VC-NEXT:    slli a1, s1, 1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a3, s0, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a2, s0
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
-; CHECK-RV64VC-NEXT:    sub t0, a2, s1
-; CHECK-RV64VC-NEXT:    mv a5, a2
-; CHECK-RV64VC-NEXT:    bltu a2, s1, .LBB911_2
+; CHECK-RV64VC-NEXT:    and a3, a3, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a3
+; CHECK-RV64VC-NEXT:    addi a5, a5, -1
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
+; CHECK-RV64VC-NEXT:    mv t0, a5
+; CHECK-RV64VC-NEXT:    bltu a5, s1, .LBB911_2
 ; CHECK-RV64VC-NEXT:  # %bb.1:
-; CHECK-RV64VC-NEXT:    mv a5, s1
+; CHECK-RV64VC-NEXT:    mv t0, s1
 ; CHECK-RV64VC-NEXT:  .LBB911_2:
-; CHECK-RV64VC-NEXT:    sltu a6, a2, t0
+; CHECK-RV64VC-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB911_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
 ; CHECK-RV64VC-NEXT:    mv a3, a1
 ; CHECK-RV64VC-NEXT:  .LBB911_4:
 ; CHECK-RV64VC-NEXT:    add a0, a0, s2
-; CHECK-RV64VC-NEXT:    addi a6, a6, -1
-; CHECK-RV64VC-NEXT:    sub a2, a3, s1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a6, a5, s1
+; CHECK-RV64VC-NEXT:    addi a7, a7, -1
+; CHECK-RV64VC-NEXT:    sub a5, a3, s1
+; CHECK-RV64VC-NEXT:    sltu a4, s1, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
 ; CHECK-RV64VC-NEXT:    bltu a3, s1, .LBB911_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a3, s1
@@ -35387,7 +35387,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-RV64VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, t0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vluxei64.v v14, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -35395,7 +35395,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64VC-NEXT:    add a0, a0, sp
 ; CHECK-RV64VC-NEXT:    addi a0, a0, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vluxei64.v v13, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -35406,21 +35406,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    and a0, a6, t0
-; CHECK-RV64VC-NEXT:    bltu s0, a7, .LBB911_8
+; CHECK-RV64VC-NEXT:    and a0, a7, a6
+; CHECK-RV64VC-NEXT:    bltu s0, a2, .LBB911_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv s0, a7
+; CHECK-RV64VC-NEXT:    mv s0, a2
 ; CHECK-RV64VC-NEXT:  .LBB911_8:
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64VC-NEXT:    sub a0, s0, a1
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    sub a2, a0, s1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    bltu a0, s1, .LBB911_10
@@ -35446,7 +35446,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV64VC-NEXT:    mv s0, a1
 ; CHECK-RV64VC-NEXT:  .LBB911_12:
 ; CHECK-RV64VC-NEXT:    sub a0, s0, s1
-; CHECK-RV64VC-NEXT:    sltu a1, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64VC-NEXT:    addi a1, a1, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -35501,45 +35501,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
 ; CHECK-RV32VC-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT:    csrr a4, vlenb
+; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT:    slli a3, a4, 3
-; CHECK-RV32VC-NEXT:    slli a1, a4, 2
-; CHECK-RV32VC-NEXT:    add a0, a0, a3
-; CHECK-RV32VC-NEXT:    sub a3, a2, a1
+; CHECK-RV32VC-NEXT:    slli a4, a1, 3
+; CHECK-RV32VC-NEXT:    slli a3, a1, 2
+; CHECK-RV32VC-NEXT:    slli a1, a1, 1
+; CHECK-RV32VC-NEXT:    add a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a2, a3
+; CHECK-RV32VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32VC-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT:    sltu a0, a2, a3
-; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a3, a3, a0
-; CHECK-RV32VC-NEXT:    slli a0, a4, 1
-; CHECK-RV32VC-NEXT:    sub a4, a3, a0
-; CHECK-RV32VC-NEXT:    sltu a5, a3, a4
+; CHECK-RV32VC-NEXT:    addi a0, a5, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a5
-; CHECK-RV32VC-NEXT:    bltu a3, a0, .LBB911_2
+; CHECK-RV32VC-NEXT:    bltu a0, a1, .LBB911_2
 ; CHECK-RV32VC-NEXT:  # %bb.1:
-; CHECK-RV32VC-NEXT:    mv a3, a0
+; CHECK-RV32VC-NEXT:    mv a0, a1
 ; CHECK-RV32VC-NEXT:  .LBB911_2:
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB911_4
+; CHECK-RV32VC-NEXT:    bltu a2, a3, .LBB911_4
 ; CHECK-RV32VC-NEXT:  # %bb.3:
-; CHECK-RV32VC-NEXT:    mv a2, a1
+; CHECK-RV32VC-NEXT:    mv a2, a3
 ; CHECK-RV32VC-NEXT:  .LBB911_4:
-; CHECK-RV32VC-NEXT:    sub a1, a2, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a2, a1
+; CHECK-RV32VC-NEXT:    sub a0, a2, a1
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
-; CHECK-RV32VC-NEXT:    and a1, a1, a3
-; CHECK-RV32VC-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    and a0, a0, a3
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT:    bltu a2, a0, .LBB911_6
+; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB911_6
 ; CHECK-RV32VC-NEXT:  # %bb.5:
-; CHECK-RV32VC-NEXT:    mv a2, a0
+; CHECK-RV32VC-NEXT:    mv a2, a1
 ; CHECK-RV32VC-NEXT:  .LBB911_6:
 ; CHECK-RV32VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35633,30 +35633,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    mv a0, s1
 ; CHECK-RV64V-NEXT:    call __muldi3
 ; CHECK-RV64V-NEXT:    slli a2, s1, 2
-; CHECK-RV64V-NEXT:    sub a1, s0, a2
-; CHECK-RV64V-NEXT:    sltu a3, s0, a1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
-; CHECK-RV64V-NEXT:    and a3, a3, a1
 ; CHECK-RV64V-NEXT:    slli a1, s1, 1
+; CHECK-RV64V-NEXT:    sub a3, s0, a2
+; CHECK-RV64V-NEXT:    sltu a4, a2, s0
+; CHECK-RV64V-NEXT:    addi a4, a4, -1
+; CHECK-RV64V-NEXT:    and a3, a4, a3
 ; CHECK-RV64V-NEXT:    sub a4, a3, a1
-; CHECK-RV64V-NEXT:    sltu a5, a3, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a3
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
-; CHECK-RV64V-NEXT:    and a6, a5, a4
-; CHECK-RV64V-NEXT:    sub a4, a6, s1
-; CHECK-RV64V-NEXT:    mv a5, a6
-; CHECK-RV64V-NEXT:    bltu a6, s1, .LBB912_2
+; CHECK-RV64V-NEXT:    and a5, a5, a4
+; CHECK-RV64V-NEXT:    mv a4, a5
+; CHECK-RV64V-NEXT:    bltu a5, s1, .LBB912_2
 ; CHECK-RV64V-NEXT:  # %bb.1:
-; CHECK-RV64V-NEXT:    mv a5, s1
+; CHECK-RV64V-NEXT:    mv a4, s1
 ; CHECK-RV64V-NEXT:  .LBB912_2:
-; CHECK-RV64V-NEXT:    sltu a7, a6, a4
+; CHECK-RV64V-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64V-NEXT:    bltu a3, a1, .LBB912_4
 ; CHECK-RV64V-NEXT:  # %bb.3:
 ; CHECK-RV64V-NEXT:    mv a3, a1
 ; CHECK-RV64V-NEXT:  .LBB912_4:
 ; CHECK-RV64V-NEXT:    add a6, s2, a0
-; CHECK-RV64V-NEXT:    addi a0, a7, -1
+; CHECK-RV64V-NEXT:    sub a0, a5, s1
+; CHECK-RV64V-NEXT:    addi a5, a7, -1
 ; CHECK-RV64V-NEXT:    sub a7, a3, s1
-; CHECK-RV64V-NEXT:    sltu t0, a3, a7
+; CHECK-RV64V-NEXT:    sltu t0, s1, a3
 ; CHECK-RV64V-NEXT:    addi t0, t0, -1
 ; CHECK-RV64V-NEXT:    and a7, t0, a7
 ; CHECK-RV64V-NEXT:    bltu a3, s1, .LBB912_6
@@ -35666,26 +35666,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (a6)
 ; CHECK-RV64V-NEXT:    addi a6, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 3
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 3
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a7, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 4
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 4
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    and a0, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a5, a0
 ; CHECK-RV64V-NEXT:    bltu s0, a2, .LBB912_8
 ; CHECK-RV64V-NEXT:  # %bb.7:
 ; CHECK-RV64V-NEXT:    mv s0, a2
@@ -35695,11 +35695,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64V-NEXT:    sub a0, s0, a1
-; CHECK-RV64V-NEXT:    sltu a2, s0, a0
+; CHECK-RV64V-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a0, a2, a0
 ; CHECK-RV64V-NEXT:    sub a2, a0, s1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    bltu a0, s1, .LBB912_10
@@ -35725,7 +35725,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    mv s0, a1
 ; CHECK-RV64V-NEXT:  .LBB912_12:
 ; CHECK-RV64V-NEXT:    sub a0, s0, s1
-; CHECK-RV64V-NEXT:    sltu a1, s0, a0
+; CHECK-RV64V-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64V-NEXT:    addi a1, a1, -1
 ; CHECK-RV64V-NEXT:    and a0, a1, a0
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -35780,45 +35780,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV32V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT:    csrr a4, vlenb
+; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT:    slli a3, a4, 3
-; CHECK-RV32V-NEXT:    slli a1, a4, 2
-; CHECK-RV32V-NEXT:    add a0, a0, a3
-; CHECK-RV32V-NEXT:    sub a3, a2, a1
+; CHECK-RV32V-NEXT:    slli a4, a1, 3
+; CHECK-RV32V-NEXT:    slli a3, a1, 2
+; CHECK-RV32V-NEXT:    slli a1, a1, 1
+; CHECK-RV32V-NEXT:    add a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a2, a3
+; CHECK-RV32V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32V-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT:    sltu a0, a2, a3
-; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a3, a0, a3
-; CHECK-RV32V-NEXT:    slli a0, a4, 1
-; CHECK-RV32V-NEXT:    sub a4, a3, a0
-; CHECK-RV32V-NEXT:    sltu a5, a3, a4
+; CHECK-RV32V-NEXT:    addi a0, a5, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a0, a1
+; CHECK-RV32V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32V-NEXT:    addi a5, a5, -1
 ; CHECK-RV32V-NEXT:    and a4, a5, a4
-; CHECK-RV32V-NEXT:    bltu a3, a0, .LBB912_2
+; CHECK-RV32V-NEXT:    bltu a0, a1, .LBB912_2
 ; CHECK-RV32V-NEXT:  # %bb.1:
-; CHECK-RV32V-NEXT:    mv a3, a0
+; CHECK-RV32V-NEXT:    mv a0, a1
 ; CHECK-RV32V-NEXT:  .LBB912_2:
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB912_4
+; CHECK-RV32V-NEXT:    bltu a2, a3, .LBB912_4
 ; CHECK-RV32V-NEXT:  # %bb.3:
-; CHECK-RV32V-NEXT:    mv a2, a1
+; CHECK-RV32V-NEXT:    mv a2, a3
 ; CHECK-RV32V-NEXT:  .LBB912_4:
-; CHECK-RV32V-NEXT:    sub a1, a2, a0
-; CHECK-RV32V-NEXT:    sltu a3, a2, a1
+; CHECK-RV32V-NEXT:    sub a0, a2, a1
+; CHECK-RV32V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
-; CHECK-RV32V-NEXT:    and a1, a3, a1
-; CHECK-RV32V-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    and a0, a3, a0
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT:    bltu a2, a0, .LBB912_6
+; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB912_6
 ; CHECK-RV32V-NEXT:  # %bb.5:
-; CHECK-RV32V-NEXT:    mv a2, a0
+; CHECK-RV32V-NEXT:    mv a2, a1
 ; CHECK-RV32V-NEXT:  .LBB912_6:
 ; CHECK-RV32V-NEXT:    addi a0, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35906,33 +35906,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    li a1, 40
 ; CHECK-RV64VC-NEXT:    mv a0, s1
 ; CHECK-RV64VC-NEXT:    call __muldi3
-; CHECK-RV64VC-NEXT:    slli a7, s1, 2
-; CHECK-RV64VC-NEXT:    sub a1, s0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a1
-; CHECK-RV64VC-NEXT:    addi a2, a2, -1
-; CHECK-RV64VC-NEXT:    and a3, a2, a1
+; CHECK-RV64VC-NEXT:    slli a2, s1, 2
 ; CHECK-RV64VC-NEXT:    slli a1, s1, 1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a3, s0, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a2, s0
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
-; CHECK-RV64VC-NEXT:    sub t0, a2, s1
-; CHECK-RV64VC-NEXT:    mv a5, a2
-; CHECK-RV64VC-NEXT:    bltu a2, s1, .LBB912_2
+; CHECK-RV64VC-NEXT:    and a3, a3, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a3
+; CHECK-RV64VC-NEXT:    addi a5, a5, -1
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
+; CHECK-RV64VC-NEXT:    mv t0, a5
+; CHECK-RV64VC-NEXT:    bltu a5, s1, .LBB912_2
 ; CHECK-RV64VC-NEXT:  # %bb.1:
-; CHECK-RV64VC-NEXT:    mv a5, s1
+; CHECK-RV64VC-NEXT:    mv t0, s1
 ; CHECK-RV64VC-NEXT:  .LBB912_2:
-; CHECK-RV64VC-NEXT:    sltu a6, a2, t0
+; CHECK-RV64VC-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB912_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
 ; CHECK-RV64VC-NEXT:    mv a3, a1
 ; CHECK-RV64VC-NEXT:  .LBB912_4:
 ; CHECK-RV64VC-NEXT:    add a0, a0, s2
-; CHECK-RV64VC-NEXT:    addi a6, a6, -1
-; CHECK-RV64VC-NEXT:    sub a2, a3, s1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a6, a5, s1
+; CHECK-RV64VC-NEXT:    addi a7, a7, -1
+; CHECK-RV64VC-NEXT:    sub a5, a3, s1
+; CHECK-RV64VC-NEXT:    sltu a4, s1, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
 ; CHECK-RV64VC-NEXT:    bltu a3, s1, .LBB912_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a3, s1
@@ -35940,7 +35940,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-RV64VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, t0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v14, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -35948,7 +35948,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    add a0, a0, sp
 ; CHECK-RV64VC-NEXT:    addi a0, a0, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v13, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -35959,21 +35959,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    and a0, a6, t0
-; CHECK-RV64VC-NEXT:    bltu s0, a7, .LBB912_8
+; CHECK-RV64VC-NEXT:    and a0, a7, a6
+; CHECK-RV64VC-NEXT:    bltu s0, a2, .LBB912_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv s0, a7
+; CHECK-RV64VC-NEXT:    mv s0, a2
 ; CHECK-RV64VC-NEXT:  .LBB912_8:
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64VC-NEXT:    sub a0, s0, a1
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    sub a2, a0, s1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    bltu a0, s1, .LBB912_10
@@ -35999,7 +35999,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    mv s0, a1
 ; CHECK-RV64VC-NEXT:  .LBB912_12:
 ; CHECK-RV64VC-NEXT:    sub a0, s0, s1
-; CHECK-RV64VC-NEXT:    sltu a1, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64VC-NEXT:    addi a1, a1, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -36054,45 +36054,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
 ; CHECK-RV32VC-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT:    csrr a4, vlenb
+; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT:    slli a3, a4, 3
-; CHECK-RV32VC-NEXT:    slli a1, a4, 2
-; CHECK-RV32VC-NEXT:    add a0, a0, a3
-; CHECK-RV32VC-NEXT:    sub a3, a2, a1
+; CHECK-RV32VC-NEXT:    slli a4, a1, 3
+; CHECK-RV32VC-NEXT:    slli a3, a1, 2
+; CHECK-RV32VC-NEXT:    slli a1, a1, 1
+; CHECK-RV32VC-NEXT:    add a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a2, a3
+; CHECK-RV32VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32VC-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT:    sltu a0, a2, a3
-; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a3, a3, a0
-; CHECK-RV32VC-NEXT:    slli a0, a4, 1
-; CHECK-RV32VC-NEXT:    sub a4, a3, a0
-; CHECK-RV32VC-NEXT:    sltu a5, a3, a4
+; CHECK-RV32VC-NEXT:    addi a0, a5, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a5
-; CHECK-RV32VC-NEXT:    bltu a3, a0, .LBB912_2
+; CHECK-RV32VC-NEXT:    bltu a0, a1, .LBB912_2
 ; CHECK-RV32VC-NEXT:  # %bb.1:
-; CHECK-RV32VC-NEXT:    mv a3, a0
+; CHECK-RV32VC-NEXT:    mv a0, a1
 ; CHECK-RV32VC-NEXT:  .LBB912_2:
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB912_4
+; CHECK-RV32VC-NEXT:    bltu a2, a3, .LBB912_4
 ; CHECK-RV32VC-NEXT:  # %bb.3:
-; CHECK-RV32VC-NEXT:    mv a2, a1
+; CHECK-RV32VC-NEXT:    mv a2, a3
 ; CHECK-RV32VC-NEXT:  .LBB912_4:
-; CHECK-RV32VC-NEXT:    sub a1, a2, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a2, a1
+; CHECK-RV32VC-NEXT:    sub a0, a2, a1
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
-; CHECK-RV32VC-NEXT:    and a1, a1, a3
-; CHECK-RV32VC-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    and a0, a0, a3
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT:    bltu a2, a0, .LBB912_6
+; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB912_6
 ; CHECK-RV32VC-NEXT:  # %bb.5:
-; CHECK-RV32VC-NEXT:    mv a2, a0
+; CHECK-RV32VC-NEXT:    mv a2, a1
 ; CHECK-RV32VC-NEXT:  .LBB912_6:
 ; CHECK-RV32VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -36186,30 +36186,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    mv a0, s1
 ; CHECK-RV64V-NEXT:    call __muldi3
 ; CHECK-RV64V-NEXT:    slli a2, s1, 2
-; CHECK-RV64V-NEXT:    sub a1, s0, a2
-; CHECK-RV64V-NEXT:    sltu a3, s0, a1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
-; CHECK-RV64V-NEXT:    and a3, a3, a1
 ; CHECK-RV64V-NEXT:    slli a1, s1, 1
+; CHECK-RV64V-NEXT:    sub a3, s0, a2
+; CHECK-RV64V-NEXT:    sltu a4, a2, s0
+; CHECK-RV64V-NEXT:    addi a4, a4, -1
+; CHECK-RV64V-NEXT:    and a3, a4, a3
 ; CHECK-RV64V-NEXT:    sub a4, a3, a1
-; CHECK-RV64V-NEXT:    sltu a5, a3, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a3
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
-; CHECK-RV64V-NEXT:    and a6, a5, a4
-; CHECK-RV64V-NEXT:    sub a4, a6, s1
-; CHECK-RV64V-NEXT:    mv a5, a6
-; CHECK-RV64V-NEXT:    bltu a6, s1, .LBB913_2
+; CHECK-RV64V-NEXT:    and a5, a5, a4
+; CHECK-RV64V-NEXT:    mv a4, a5
+; CHECK-RV64V-NEXT:    bltu a5, s1, .LBB913_2
 ; CHECK-RV64V-NEXT:  # %bb.1:
-; CHECK-RV64V-NEXT:    mv a5, s1
+; CHECK-RV64V-NEXT:    mv a4, s1
 ; CHECK-RV64V-NEXT:  .LBB913_2:
-; CHECK-RV64V-NEXT:    sltu a7, a6, a4
+; CHECK-RV64V-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64V-NEXT:    bltu a3, a1, .LBB913_4
 ; CHECK-RV64V-NEXT:  # %bb.3:
 ; CHECK-RV64V-NEXT:    mv a3, a1
 ; CHECK-RV64V-NEXT:  .LBB913_4:
 ; CHECK-RV64V-NEXT:    add a6, s2, a0
-; CHECK-RV64V-NEXT:    addi a0, a7, -1
+; CHECK-RV64V-NEXT:    sub a0, a5, s1
+; CHECK-RV64V-NEXT:    addi a5, a7, -1
 ; CHECK-RV64V-NEXT:    sub a7, a3, s1
-; CHECK-RV64V-NEXT:    sltu t0, a3, a7
+; CHECK-RV64V-NEXT:    sltu t0, s1, a3
 ; CHECK-RV64V-NEXT:    addi t0, t0, -1
 ; CHECK-RV64V-NEXT:    and a7, t0, a7
 ; CHECK-RV64V-NEXT:    bltu a3, s1, .LBB913_6
@@ -36219,26 +36219,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (a6)
 ; CHECK-RV64V-NEXT:    addi a6, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 3
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 3
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a7, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 4
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 4
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    and a0, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a5, a0
 ; CHECK-RV64V-NEXT:    bltu s0, a2, .LBB913_8
 ; CHECK-RV64V-NEXT:  # %bb.7:
 ; CHECK-RV64V-NEXT:    mv s0, a2
@@ -36248,11 +36248,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64V-NEXT:    sub a0, s0, a1
-; CHECK-RV64V-NEXT:    sltu a2, s0, a0
+; CHECK-RV64V-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a0, a2, a0
 ; CHECK-RV64V-NEXT:    sub a2, a0, s1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    bltu a0, s1, .LBB913_10
@@ -36278,7 +36278,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64V-NEXT:    mv s0, a1
 ; CHECK-RV64V-NEXT:  .LBB913_12:
 ; CHECK-RV64V-NEXT:    sub a0, s0, s1
-; CHECK-RV64V-NEXT:    sltu a1, s0, a0
+; CHECK-RV64V-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64V-NEXT:    addi a1, a1, -1
 ; CHECK-RV64V-NEXT:    and a0, a1, a0
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -36333,45 +36333,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV32V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT:    csrr a4, vlenb
+; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT:    slli a3, a4, 3
-; CHECK-RV32V-NEXT:    slli a1, a4, 2
-; CHECK-RV32V-NEXT:    add a0, a0, a3
-; CHECK-RV32V-NEXT:    sub a3, a2, a1
+; CHECK-RV32V-NEXT:    slli a4, a1, 3
+; CHECK-RV32V-NEXT:    slli a3, a1, 2
+; CHECK-RV32V-NEXT:    slli a1, a1, 1
+; CHECK-RV32V-NEXT:    add a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a2, a3
+; CHECK-RV32V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32V-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT:    sltu a0, a2, a3
-; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a3, a0, a3
-; CHECK-RV32V-NEXT:    slli a0, a4, 1
-; CHECK-RV32V-NEXT:    sub a4, a3, a0
-; CHECK-RV32V-NEXT:    sltu a5, a3, a4
+; CHECK-RV32V-NEXT:    addi a0, a5, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a0, a1
+; CHECK-RV32V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32V-NEXT:    addi a5, a5, -1
 ; CHECK-RV32V-NEXT:    and a4, a5, a4
-; CHECK-RV32V-NEXT:    bltu a3, a0, .LBB913_2
+; CHECK-RV32V-NEXT:    bltu a0, a1, .LBB913_2
 ; CHECK-RV32V-NEXT:  # %bb.1:
-; CHECK-RV32V-NEXT:    mv a3, a0
+; CHECK-RV32V-NEXT:    mv a0, a1
 ; CHECK-RV32V-NEXT:  .LBB913_2:
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB913_4
+; CHECK-RV32V-NEXT:    bltu a2, a3, .LBB913_4
 ; CHECK-RV32V-NEXT:  # %bb.3:
-; CHECK-RV32V-NEXT:    mv a2, a1
+; CHECK-RV32V-NEXT:    mv a2, a3
 ; CHECK-RV32V-NEXT:  .LBB913_4:
-; CHECK-RV32V-NEXT:    sub a1, a2, a0
-; CHECK-RV32V-NEXT:    sltu a3, a2, a1
+; CHECK-RV32V-NEXT:    sub a0, a2, a1
+; CHECK-RV32V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
-; CHECK-RV32V-NEXT:    and a1, a3, a1
-; CHECK-RV32V-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    and a0, a3, a0
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT:    bltu a2, a0, .LBB913_6
+; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB913_6
 ; CHECK-RV32V-NEXT:  # %bb.5:
-; CHECK-RV32V-NEXT:    mv a2, a0
+; CHECK-RV32V-NEXT:    mv a2, a1
 ; CHECK-RV32V-NEXT:  .LBB913_6:
 ; CHECK-RV32V-NEXT:    addi a0, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -36459,33 +36459,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    li a1, 40
 ; CHECK-RV64VC-NEXT:    mv a0, s1
 ; CHECK-RV64VC-NEXT:    call __muldi3
-; CHECK-RV64VC-NEXT:    slli a7, s1, 2
-; CHECK-RV64VC-NEXT:    sub a1, s0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a1
-; CHECK-RV64VC-NEXT:    addi a2, a2, -1
-; CHECK-RV64VC-NEXT:    and a3, a2, a1
+; CHECK-RV64VC-NEXT:    slli a2, s1, 2
 ; CHECK-RV64VC-NEXT:    slli a1, s1, 1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a3, s0, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a2, s0
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
-; CHECK-RV64VC-NEXT:    sub t0, a2, s1
-; CHECK-RV64VC-NEXT:    mv a5, a2
-; CHECK-RV64VC-NEXT:    bltu a2, s1, .LBB913_2
+; CHECK-RV64VC-NEXT:    and a3, a3, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a3
+; CHECK-RV64VC-NEXT:    addi a5, a5, -1
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
+; CHECK-RV64VC-NEXT:    mv t0, a5
+; CHECK-RV64VC-NEXT:    bltu a5, s1, .LBB913_2
 ; CHECK-RV64VC-NEXT:  # %bb.1:
-; CHECK-RV64VC-NEXT:    mv a5, s1
+; CHECK-RV64VC-NEXT:    mv t0, s1
 ; CHECK-RV64VC-NEXT:  .LBB913_2:
-; CHECK-RV64VC-NEXT:    sltu a6, a2, t0
+; CHECK-RV64VC-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB913_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
 ; CHECK-RV64VC-NEXT:    mv a3, a1
 ; CHECK-RV64VC-NEXT:  .LBB913_4:
 ; CHECK-RV64VC-NEXT:    add a0, a0, s2
-; CHECK-RV64VC-NEXT:    addi a6, a6, -1
-; CHECK-RV64VC-NEXT:    sub a2, a3, s1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a6, a5, s1
+; CHECK-RV64VC-NEXT:    addi a7, a7, -1
+; CHECK-RV64VC-NEXT:    sub a5, a3, s1
+; CHECK-RV64VC-NEXT:    sltu a4, s1, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
 ; CHECK-RV64VC-NEXT:    bltu a3, s1, .LBB913_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a3, s1
@@ -36493,7 +36493,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-RV64VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, t0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v14, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -36501,7 +36501,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    add a0, a0, sp
 ; CHECK-RV64VC-NEXT:    addi a0, a0, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v13, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -36512,21 +36512,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    and a0, a6, t0
-; CHECK-RV64VC-NEXT:    bltu s0, a7, .LBB913_8
+; CHECK-RV64VC-NEXT:    and a0, a7, a6
+; CHECK-RV64VC-NEXT:    bltu s0, a2, .LBB913_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv s0, a7
+; CHECK-RV64VC-NEXT:    mv s0, a2
 ; CHECK-RV64VC-NEXT:  .LBB913_8:
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64VC-NEXT:    sub a0, s0, a1
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    sub a2, a0, s1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    bltu a0, s1, .LBB913_10
@@ -36552,7 +36552,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV64VC-NEXT:    mv s0, a1
 ; CHECK-RV64VC-NEXT:  .LBB913_12:
 ; CHECK-RV64VC-NEXT:    sub a0, s0, s1
-; CHECK-RV64VC-NEXT:    sltu a1, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64VC-NEXT:    addi a1, a1, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -36607,45 +36607,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
 ; CHECK-RV32VC-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT:    csrr a4, vlenb
+; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT:    slli a3, a4, 3
-; CHECK-RV32VC-NEXT:    slli a1, a4, 2
-; CHECK-RV32VC-NEXT:    add a0, a0, a3
-; CHECK-RV32VC-NEXT:    sub a3, a2, a1
+; CHECK-RV32VC-NEXT:    slli a4, a1, 3
+; CHECK-RV32VC-NEXT:    slli a3, a1, 2
+; CHECK-RV32VC-NEXT:    slli a1, a1, 1
+; CHECK-RV32VC-NEXT:    add a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a2, a3
+; CHECK-RV32VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32VC-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT:    sltu a0, a2, a3
-; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a3, a3, a0
-; CHECK-RV32VC-NEXT:    slli a0, a4, 1
-; CHECK-RV32VC-NEXT:    sub a4, a3, a0
-; CHECK-RV32VC-NEXT:    sltu a5, a3, a4
+; CHECK-RV32VC-NEXT:    addi a0, a5, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a5
-; CHECK-RV32VC-NEXT:    bltu a3, a0, .LBB913_2
+; CHECK-RV32VC-NEXT:    bltu a0, a1, .LBB913_2
 ; CHECK-RV32VC-NEXT:  # %bb.1:
-; CHECK-RV32VC-NEXT:    mv a3, a0
+; CHECK-RV32VC-NEXT:    mv a0, a1
 ; CHECK-RV32VC-NEXT:  .LBB913_2:
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB913_4
+; CHECK-RV32VC-NEXT:    bltu a2, a3, .LBB913_4
 ; CHECK-RV32VC-NEXT:  # %bb.3:
-; CHECK-RV32VC-NEXT:    mv a2, a1
+; CHECK-RV32VC-NEXT:    mv a2, a3
 ; CHECK-RV32VC-NEXT:  .LBB913_4:
-; CHECK-RV32VC-NEXT:    sub a1, a2, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a2, a1
+; CHECK-RV32VC-NEXT:    sub a0, a2, a1
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
-; CHECK-RV32VC-NEXT:    and a1, a1, a3
-; CHECK-RV32VC-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    and a0, a0, a3
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT:    bltu a2, a0, .LBB913_6
+; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB913_6
 ; CHECK-RV32VC-NEXT:  # %bb.5:
-; CHECK-RV32VC-NEXT:    mv a2, a0
+; CHECK-RV32VC-NEXT:    mv a2, a1
 ; CHECK-RV32VC-NEXT:  .LBB913_6:
 ; CHECK-RV32VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -36738,30 +36738,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64V-NEXT:    mv a0, s1
 ; CHECK-RV64V-NEXT:    call __muldi3
 ; CHECK-RV64V-NEXT:    slli a2, s1, 2
-; CHECK-RV64V-NEXT:    sub a1, s0, a2
-; CHECK-RV64V-NEXT:    sltu a3, s0, a1
-; CHECK-RV64V-NEXT:    addi a3, a3, -1
-; CHECK-RV64V-NEXT:    and a3, a3, a1
 ; CHECK-RV64V-NEXT:    slli a1, s1, 1
+; CHECK-RV64V-NEXT:    sub a3, s0, a2
+; CHECK-RV64V-NEXT:    sltu a4, a2, s0
+; CHECK-RV64V-NEXT:    addi a4, a4, -1
+; CHECK-RV64V-NEXT:    and a3, a4, a3
 ; CHECK-RV64V-NEXT:    sub a4, a3, a1
-; CHECK-RV64V-NEXT:    sltu a5, a3, a4
+; CHECK-RV64V-NEXT:    sltu a5, a1, a3
 ; CHECK-RV64V-NEXT:    addi a5, a5, -1
-; CHECK-RV64V-NEXT:    and a6, a5, a4
-; CHECK-RV64V-NEXT:    sub a4, a6, s1
-; CHECK-RV64V-NEXT:    mv a5, a6
-; CHECK-RV64V-NEXT:    bltu a6, s1, .LBB914_2
+; CHECK-RV64V-NEXT:    and a5, a5, a4
+; CHECK-RV64V-NEXT:    mv a4, a5
+; CHECK-RV64V-NEXT:    bltu a5, s1, .LBB914_2
 ; CHECK-RV64V-NEXT:  # %bb.1:
-; CHECK-RV64V-NEXT:    mv a5, s1
+; CHECK-RV64V-NEXT:    mv a4, s1
 ; CHECK-RV64V-NEXT:  .LBB914_2:
-; CHECK-RV64V-NEXT:    sltu a7, a6, a4
+; CHECK-RV64V-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64V-NEXT:    bltu a3, a1, .LBB914_4
 ; CHECK-RV64V-NEXT:  # %bb.3:
 ; CHECK-RV64V-NEXT:    mv a3, a1
 ; CHECK-RV64V-NEXT:  .LBB914_4:
 ; CHECK-RV64V-NEXT:    add a6, s2, a0
-; CHECK-RV64V-NEXT:    addi a0, a7, -1
+; CHECK-RV64V-NEXT:    sub a0, a5, s1
+; CHECK-RV64V-NEXT:    addi a5, a7, -1
 ; CHECK-RV64V-NEXT:    sub a7, a3, s1
-; CHECK-RV64V-NEXT:    sltu t0, a3, a7
+; CHECK-RV64V-NEXT:    sltu t0, s1, a3
 ; CHECK-RV64V-NEXT:    addi t0, t0, -1
 ; CHECK-RV64V-NEXT:    and a7, t0, a7
 ; CHECK-RV64V-NEXT:    bltu a3, s1, .LBB914_6
@@ -36771,26 +36771,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (a6)
 ; CHECK-RV64V-NEXT:    addi a6, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 3
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 3
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a7, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT:    csrr a5, vlenb
-; CHECK-RV64V-NEXT:    slli a5, a5, 4
-; CHECK-RV64V-NEXT:    add a5, sp, a5
-; CHECK-RV64V-NEXT:    addi a5, a5, 16
-; CHECK-RV64V-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    csrr a4, vlenb
+; CHECK-RV64V-NEXT:    slli a4, a4, 4
+; CHECK-RV64V-NEXT:    add a4, sp, a4
+; CHECK-RV64V-NEXT:    addi a4, a4, 16
+; CHECK-RV64V-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT:    and a0, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a5, a0
 ; CHECK-RV64V-NEXT:    bltu s0, a2, .LBB914_8
 ; CHECK-RV64V-NEXT:  # %bb.7:
 ; CHECK-RV64V-NEXT:    mv s0, a2
@@ -36800,11 +36800,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64V-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64V-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64V-NEXT:    sub a0, s0, a1
-; CHECK-RV64V-NEXT:    sltu a2, s0, a0
+; CHECK-RV64V-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    and a0, a2, a0
 ; CHECK-RV64V-NEXT:    sub a2, a0, s1
-; CHECK-RV64V-NEXT:    sltu a3, a0, a2
+; CHECK-RV64V-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a2, a3, a2
 ; CHECK-RV64V-NEXT:    bltu a0, s1, .LBB914_10
@@ -36830,7 +36830,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64V-NEXT:    mv s0, a1
 ; CHECK-RV64V-NEXT:  .LBB914_12:
 ; CHECK-RV64V-NEXT:    sub a0, s0, s1
-; CHECK-RV64V-NEXT:    sltu a1, s0, a0
+; CHECK-RV64V-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64V-NEXT:    addi a1, a1, -1
 ; CHECK-RV64V-NEXT:    and a0, a1, a0
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -36885,45 +36885,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV32V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT:    csrr a4, vlenb
+; CHECK-RV32V-NEXT:    csrr a1, vlenb
 ; CHECK-RV32V-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT:    slli a3, a4, 3
-; CHECK-RV32V-NEXT:    slli a1, a4, 2
-; CHECK-RV32V-NEXT:    add a0, a0, a3
-; CHECK-RV32V-NEXT:    sub a3, a2, a1
+; CHECK-RV32V-NEXT:    slli a4, a1, 3
+; CHECK-RV32V-NEXT:    slli a3, a1, 2
+; CHECK-RV32V-NEXT:    slli a1, a1, 1
+; CHECK-RV32V-NEXT:    add a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a2, a3
+; CHECK-RV32V-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32V-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT:    sltu a0, a2, a3
-; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a3, a0, a3
-; CHECK-RV32V-NEXT:    slli a0, a4, 1
-; CHECK-RV32V-NEXT:    sub a4, a3, a0
-; CHECK-RV32V-NEXT:    sltu a5, a3, a4
+; CHECK-RV32V-NEXT:    addi a0, a5, -1
+; CHECK-RV32V-NEXT:    and a0, a0, a4
+; CHECK-RV32V-NEXT:    sub a4, a0, a1
+; CHECK-RV32V-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32V-NEXT:    addi a5, a5, -1
 ; CHECK-RV32V-NEXT:    and a4, a5, a4
-; CHECK-RV32V-NEXT:    bltu a3, a0, .LBB914_2
+; CHECK-RV32V-NEXT:    bltu a0, a1, .LBB914_2
 ; CHECK-RV32V-NEXT:  # %bb.1:
-; CHECK-RV32V-NEXT:    mv a3, a0
+; CHECK-RV32V-NEXT:    mv a0, a1
 ; CHECK-RV32V-NEXT:  .LBB914_2:
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB914_4
+; CHECK-RV32V-NEXT:    bltu a2, a3, .LBB914_4
 ; CHECK-RV32V-NEXT:  # %bb.3:
-; CHECK-RV32V-NEXT:    mv a2, a1
+; CHECK-RV32V-NEXT:    mv a2, a3
 ; CHECK-RV32V-NEXT:  .LBB914_4:
-; CHECK-RV32V-NEXT:    sub a1, a2, a0
-; CHECK-RV32V-NEXT:    sltu a3, a2, a1
+; CHECK-RV32V-NEXT:    sub a0, a2, a1
+; CHECK-RV32V-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32V-NEXT:    addi a3, a3, -1
-; CHECK-RV32V-NEXT:    and a1, a3, a1
-; CHECK-RV32V-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT:    and a0, a3, a0
+; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT:    bltu a2, a0, .LBB914_6
+; CHECK-RV32V-NEXT:    bltu a2, a1, .LBB914_6
 ; CHECK-RV32V-NEXT:  # %bb.5:
-; CHECK-RV32V-NEXT:    mv a2, a0
+; CHECK-RV32V-NEXT:    mv a2, a1
 ; CHECK-RV32V-NEXT:  .LBB914_6:
 ; CHECK-RV32V-NEXT:    addi a0, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -37011,33 +37011,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64VC-NEXT:    li a1, 40
 ; CHECK-RV64VC-NEXT:    mv a0, s1
 ; CHECK-RV64VC-NEXT:    call __muldi3
-; CHECK-RV64VC-NEXT:    slli a7, s1, 2
-; CHECK-RV64VC-NEXT:    sub a1, s0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a1
-; CHECK-RV64VC-NEXT:    addi a2, a2, -1
-; CHECK-RV64VC-NEXT:    and a3, a2, a1
+; CHECK-RV64VC-NEXT:    slli a2, s1, 2
 ; CHECK-RV64VC-NEXT:    slli a1, s1, 1
-; CHECK-RV64VC-NEXT:    sub a2, a3, a1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a3, s0, a2
+; CHECK-RV64VC-NEXT:    sltu a4, a2, s0
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
-; CHECK-RV64VC-NEXT:    sub t0, a2, s1
-; CHECK-RV64VC-NEXT:    mv a5, a2
-; CHECK-RV64VC-NEXT:    bltu a2, s1, .LBB914_2
+; CHECK-RV64VC-NEXT:    and a3, a3, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a5, a1, a3
+; CHECK-RV64VC-NEXT:    addi a5, a5, -1
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
+; CHECK-RV64VC-NEXT:    mv t0, a5
+; CHECK-RV64VC-NEXT:    bltu a5, s1, .LBB914_2
 ; CHECK-RV64VC-NEXT:  # %bb.1:
-; CHECK-RV64VC-NEXT:    mv a5, s1
+; CHECK-RV64VC-NEXT:    mv t0, s1
 ; CHECK-RV64VC-NEXT:  .LBB914_2:
-; CHECK-RV64VC-NEXT:    sltu a6, a2, t0
+; CHECK-RV64VC-NEXT:    sltu a7, s1, a5
 ; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB914_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
 ; CHECK-RV64VC-NEXT:    mv a3, a1
 ; CHECK-RV64VC-NEXT:  .LBB914_4:
 ; CHECK-RV64VC-NEXT:    add a0, a0, s2
-; CHECK-RV64VC-NEXT:    addi a6, a6, -1
-; CHECK-RV64VC-NEXT:    sub a2, a3, s1
-; CHECK-RV64VC-NEXT:    sltu a4, a3, a2
+; CHECK-RV64VC-NEXT:    sub a6, a5, s1
+; CHECK-RV64VC-NEXT:    addi a7, a7, -1
+; CHECK-RV64VC-NEXT:    sub a5, a3, s1
+; CHECK-RV64VC-NEXT:    sltu a4, s1, a3
 ; CHECK-RV64VC-NEXT:    addi a4, a4, -1
-; CHECK-RV64VC-NEXT:    and a2, a2, a4
+; CHECK-RV64VC-NEXT:    and a5, a5, a4
 ; CHECK-RV64VC-NEXT:    bltu a3, s1, .LBB914_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a3, s1
@@ -37045,7 +37045,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-RV64VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, t0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v14, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -37053,7 +37053,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64VC-NEXT:    add a0, a0, sp
 ; CHECK-RV64VC-NEXT:    addi a0, a0, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v13, (zero), v24
 ; CHECK-RV64VC-NEXT:    csrr a0, vlenb
@@ -37064,21 +37064,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT:    and a0, a6, t0
-; CHECK-RV64VC-NEXT:    bltu s0, a7, .LBB914_8
+; CHECK-RV64VC-NEXT:    and a0, a7, a6
+; CHECK-RV64VC-NEXT:    bltu s0, a2, .LBB914_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv s0, a7
+; CHECK-RV64VC-NEXT:    mv s0, a2
 ; CHECK-RV64VC-NEXT:  .LBB914_8:
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vluxei64.v v15, (zero), v16
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (s2)
 ; CHECK-RV64VC-NEXT:    sub a0, s0, a1
-; CHECK-RV64VC-NEXT:    sltu a2, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a2, a1, s0
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a2
 ; CHECK-RV64VC-NEXT:    sub a2, a0, s1
-; CHECK-RV64VC-NEXT:    sltu a3, a0, a2
+; CHECK-RV64VC-NEXT:    sltu a3, s1, a0
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a3
 ; CHECK-RV64VC-NEXT:    bltu a0, s1, .LBB914_10
@@ -37104,7 +37104,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV64VC-NEXT:    mv s0, a1
 ; CHECK-RV64VC-NEXT:  .LBB914_12:
 ; CHECK-RV64VC-NEXT:    sub a0, s0, s1
-; CHECK-RV64VC-NEXT:    sltu a1, s0, a0
+; CHECK-RV64VC-NEXT:    sltu a1, s1, s0
 ; CHECK-RV64VC-NEXT:    addi a1, a1, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -37159,45 +37159,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
 ; CHECK-RV32VC-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT:    csrr a4, vlenb
+; CHECK-RV32VC-NEXT:    csrr a1, vlenb
 ; CHECK-RV32VC-NEXT:    vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT:    slli a3, a4, 3
-; CHECK-RV32VC-NEXT:    slli a1, a4, 2
-; CHECK-RV32VC-NEXT:    add a0, a0, a3
-; CHECK-RV32VC-NEXT:    sub a3, a2, a1
+; CHECK-RV32VC-NEXT:    slli a4, a1, 3
+; CHECK-RV32VC-NEXT:    slli a3, a1, 2
+; CHECK-RV32VC-NEXT:    slli a1, a1, 1
+; CHECK-RV32VC-NEXT:    add a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a2, a3
+; CHECK-RV32VC-NEXT:    sltu a5, a3, a2
 ; CHECK-RV32VC-NEXT:    vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT:    sltu a0, a2, a3
-; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a3, a3, a0
-; CHECK-RV32VC-NEXT:    slli a0, a4, 1
-; CHECK-RV32VC-NEXT:    sub a4, a3, a0
-; CHECK-RV32VC-NEXT:    sltu a5, a3, a4
+; CHECK-RV32VC-NEXT:    addi a0, a5, -1
+; CHECK-RV32VC-NEXT:    and a0, a0, a4
+; CHECK-RV32VC-NEXT:    sub a4, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a5, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a5, a5, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a5
-; CHECK-RV32VC-NEXT:    bltu a3, a0, .LBB914_2
+; CHECK-RV32VC-NEXT:    bltu a0, a1, .LBB914_2
 ; CHECK-RV32VC-NEXT:  # %bb.1:
-; CHECK-RV32VC-NEXT:    mv a3, a0
+; CHECK-RV32VC-NEXT:    mv a0, a1
 ; CHECK-RV32VC-NEXT:  .LBB914_2:
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB914_4
+; CHECK-RV32VC-NEXT:    bltu a2, a3, .LBB914_4
 ; CHECK-RV32VC-NEXT:  # %bb.3:
-; CHECK-RV32VC-NEXT:    mv a2, a1
+; CHECK-RV32VC-NEXT:    mv a2, a3
 ; CHECK-RV32VC-NEXT:  .LBB914_4:
-; CHECK-RV32VC-NEXT:    sub a1, a2, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a2, a1
+; CHECK-RV32VC-NEXT:    sub a0, a2, a1
+; CHECK-RV32VC-NEXT:    sltu a3, a1, a2
 ; CHECK-RV32VC-NEXT:    addi a3, a3, -1
-; CHECK-RV32VC-NEXT:    and a1, a1, a3
-; CHECK-RV32VC-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT:    and a0, a0, a3
+; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT:    bltu a2, a0, .LBB914_6
+; CHECK-RV32VC-NEXT:    bltu a2, a1, .LBB914_6
 ; CHECK-RV32VC-NEXT:  # %bb.5:
-; CHECK-RV32VC-NEXT:    mv a2, a0
+; CHECK-RV32VC-NEXT:    mv a2, a1
 ; CHECK-RV32VC-NEXT:  .LBB914_6:
 ; CHECK-RV32VC-NEXT:    addi a0, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -37342,9 +37342,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, s0
+; CHECK-RV64V-NEXT:    sltu a4, s0, a4
 ; CHECK-RV64V-NEXT:    sub a5, a3, a1
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a5
+; CHECK-RV64V-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
@@ -37360,17 +37360,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT:    sub a3, a0, s0
-; CHECK-RV64V-NEXT:    sub a2, s1, a2
-; CHECK-RV64V-NEXT:    sltu a0, a0, a3
-; CHECK-RV64V-NEXT:    sltu a4, s1, a2
+; CHECK-RV64V-NEXT:    sub a3, s1, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, s1
+; CHECK-RV64V-NEXT:    sub a4, a0, s0
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    and a3, a0, a3
-; CHECK-RV64V-NEXT:    and a0, a4, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
+; CHECK-RV64V-NEXT:    and a4, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a2, a3
 ; CHECK-RV64V-NEXT:    addi a2, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a2, a0
@@ -37391,23 +37391,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT:    sub a3, a2, s0
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a2, a2, a3
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a3, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a1, a2, s0
+; CHECK-RV64V-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a2, a2, a3
-; CHECK-RV64V-NEXT:    and a0, a0, a1
-; CHECK-RV64V-NEXT:    csrr a1, vlenb
-; CHECK-RV64V-NEXT:    slli a1, a1, 3
-; CHECK-RV64V-NEXT:    mv a3, a1
-; CHECK-RV64V-NEXT:    slli a1, a1, 1
-; CHECK-RV64V-NEXT:    add a1, a1, a3
-; CHECK-RV64V-NEXT:    add a1, sp, a1
-; CHECK-RV64V-NEXT:    addi a1, a1, 16
-; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    and a1, a2, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a3
+; CHECK-RV64V-NEXT:    csrr a2, vlenb
+; CHECK-RV64V-NEXT:    slli a2, a2, 3
+; CHECK-RV64V-NEXT:    mv a3, a2
+; CHECK-RV64V-NEXT:    slli a2, a2, 1
+; CHECK-RV64V-NEXT:    add a2, a2, a3
+; CHECK-RV64V-NEXT:    add a2, sp, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, 16
+; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v29, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a1, a0
@@ -37424,7 +37424,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    ntl.p1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a1, a0, s0
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a1
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -37493,13 +37493,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32V-NEXT:    sub a0, a4, a1
-; CHECK-RV32V-NEXT:    sub a2, a3, a2
-; CHECK-RV32V-NEXT:    sltu a4, a4, a0
-; CHECK-RV32V-NEXT:    sltu a3, a3, a2
+; CHECK-RV32V-NEXT:    sltu a4, a1, a4
+; CHECK-RV32V-NEXT:    sub a5, a3, a2
+; CHECK-RV32V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32V-NEXT:    addi a4, a4, -1
-; CHECK-RV32V-NEXT:    addi a3, a3, -1
+; CHECK-RV32V-NEXT:    addi a2, a2, -1
 ; CHECK-RV32V-NEXT:    and a4, a4, a0
-; CHECK-RV32V-NEXT:    and a0, a3, a2
+; CHECK-RV32V-NEXT:    and a0, a2, a5
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v0
@@ -37511,10 +37511,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.p1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT:    sub a1, a0, a1
-; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    sub a2, a0, a1
+; CHECK-RV32V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a0, a0, a1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -37609,102 +37609,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    mv a3, a6
 ; CHECK-RV64VC-NEXT:  .LBB915_2:
 ; CHECK-RV64VC-NEXT:    slli a5, s0, 4
-; CHECK-RV64VC-NEXT:    slli a7, s0, 1
-; CHECK-RV64VC-NEXT:    slli a2, s0, 3
+; CHECK-RV64VC-NEXT:    slli a1, s0, 1
+; CHECK-RV64VC-NEXT:    slli a7, s0, 3
 ; CHECK-RV64VC-NEXT:    mv a4, a3
-; CHECK-RV64VC-NEXT:    bltu a3, a7, .LBB915_4
+; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB915_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
-; CHECK-RV64VC-NEXT:    mv a4, a7
+; CHECK-RV64VC-NEXT:    mv a4, a1
 ; CHECK-RV64VC-NEXT:  .LBB915_4:
 ; CHECK-RV64VC-NEXT:    vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT:    add a1, s1, a0
+; CHECK-RV64VC-NEXT:    add a2, s1, a0
 ; CHECK-RV64VC-NEXT:    add a5, a5, s1
-; CHECK-RV64VC-NEXT:    add a2, a2, s1
+; CHECK-RV64VC-NEXT:    add a7, a7, s1
 ; CHECK-RV64VC-NEXT:    mv a0, a4
 ; CHECK-RV64VC-NEXT:    bltu a4, s0, .LBB915_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a0, s0
 ; CHECK-RV64VC-NEXT:  .LBB915_6:
-; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 3
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 3
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT:    addi a1, sp, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    mv a2, a1
-; CHECK-RV64VC-NEXT:    slli a1, a1, 1
-; CHECK-RV64VC-NEXT:    add a1, a1, a2
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    addi a2, sp, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    mv a5, a2
+; CHECK-RV64VC-NEXT:    slli a2, a2, 1
+; CHECK-RV64VC-NEXT:    add a2, a2, a5
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, s0
-; CHECK-RV64VC-NEXT:    sub a1, a3, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a1
+; CHECK-RV64VC-NEXT:    and a0, a3, a4
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT:    mv a1, a0
+; CHECK-RV64VC-NEXT:    mv a2, a0
 ; CHECK-RV64VC-NEXT:    bltu a0, s0, .LBB915_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a2, s0
 ; CHECK-RV64VC-NEXT:  .LBB915_8:
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT:    sub a1, a0, s0
 ; CHECK-RV64VC-NEXT:    sub a2, s2, a6
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, s2, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a6, s2
+; CHECK-RV64VC-NEXT:    sub a4, a0, s0
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
-; CHECK-RV64VC-NEXT:    and a1, a1, a0
+; CHECK-RV64VC-NEXT:    and a4, a4, a0
 ; CHECK-RV64VC-NEXT:    and a0, a3, a2
 ; CHECK-RV64VC-NEXT:    addi a2, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64VC-NEXT:    mv a2, a0
-; CHECK-RV64VC-NEXT:    bltu a0, a7, .LBB915_10
+; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB915_10
 ; CHECK-RV64VC-NEXT:  # %bb.9:
-; CHECK-RV64VC-NEXT:    mv a2, a7
+; CHECK-RV64VC-NEXT:    mv a2, a1
 ; CHECK-RV64VC-NEXT:  .LBB915_10:
-; CHECK-RV64VC-NEXT:    mv a1, a2
+; CHECK-RV64VC-NEXT:    mv a3, a2
 ; CHECK-RV64VC-NEXT:    bltu a2, s0, .LBB915_12
 ; CHECK-RV64VC-NEXT:  # %bb.11:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a3, s0
 ; CHECK-RV64VC-NEXT:  .LBB915_12:
-; CHECK-RV64VC-NEXT:    csrr a3, vlenb
-; CHECK-RV64VC-NEXT:    slli a3, a3, 3
-; CHECK-RV64VC-NEXT:    add a3, a3, sp
-; CHECK-RV64VC-NEXT:    addi a3, a3, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    csrr a4, vlenb
+; CHECK-RV64VC-NEXT:    slli a4, a4, 3
+; CHECK-RV64VC-NEXT:    add a4, a4, sp
+; CHECK-RV64VC-NEXT:    addi a4, a4, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT:    sub a3, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    sub a1, a2, s0
-; CHECK-RV64VC-NEXT:    sub a3, a0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a3
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a1, a1, a2
@@ -37734,7 +37734,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    c.ntl.p1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a1, a0, s0
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -37803,13 +37803,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32VC-NEXT:    sub a0, a4, a1
-; CHECK-RV32VC-NEXT:    sub a2, a3, a2
-; CHECK-RV32VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV32VC-NEXT:    sub a5, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32VC-NEXT:    addi a4, a4, -1
-; CHECK-RV32VC-NEXT:    addi a3, a3, -1
+; CHECK-RV32VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a0
-; CHECK-RV32VC-NEXT:    and a0, a3, a2
+; CHECK-RV32VC-NEXT:    and a0, a2, a5
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v0
@@ -37821,10 +37821,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.p1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT:    sub a1, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -37967,9 +37967,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, s0
+; CHECK-RV64V-NEXT:    sltu a4, s0, a4
 ; CHECK-RV64V-NEXT:    sub a5, a3, a1
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a5
+; CHECK-RV64V-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
@@ -37985,17 +37985,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT:    sub a3, a0, s0
-; CHECK-RV64V-NEXT:    sub a2, s1, a2
-; CHECK-RV64V-NEXT:    sltu a0, a0, a3
-; CHECK-RV64V-NEXT:    sltu a4, s1, a2
+; CHECK-RV64V-NEXT:    sub a3, s1, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, s1
+; CHECK-RV64V-NEXT:    sub a4, a0, s0
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    and a3, a0, a3
-; CHECK-RV64V-NEXT:    and a0, a4, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
+; CHECK-RV64V-NEXT:    and a4, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a2, a3
 ; CHECK-RV64V-NEXT:    addi a2, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a2, a0
@@ -38016,23 +38016,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT:    sub a3, a2, s0
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a2, a2, a3
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a3, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a1, a2, s0
+; CHECK-RV64V-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a2, a2, a3
-; CHECK-RV64V-NEXT:    and a0, a0, a1
-; CHECK-RV64V-NEXT:    csrr a1, vlenb
-; CHECK-RV64V-NEXT:    slli a1, a1, 3
-; CHECK-RV64V-NEXT:    mv a3, a1
-; CHECK-RV64V-NEXT:    slli a1, a1, 1
-; CHECK-RV64V-NEXT:    add a1, a1, a3
-; CHECK-RV64V-NEXT:    add a1, sp, a1
-; CHECK-RV64V-NEXT:    addi a1, a1, 16
-; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    and a1, a2, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a3
+; CHECK-RV64V-NEXT:    csrr a2, vlenb
+; CHECK-RV64V-NEXT:    slli a2, a2, 3
+; CHECK-RV64V-NEXT:    mv a3, a2
+; CHECK-RV64V-NEXT:    slli a2, a2, 1
+; CHECK-RV64V-NEXT:    add a2, a2, a3
+; CHECK-RV64V-NEXT:    add a2, sp, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, 16
+; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v29, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a1, a0
@@ -38049,7 +38049,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV64V-NEXT:    ntl.pall
 ; CHECK-RV64V-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a1, a0, s0
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a1
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -38118,13 +38118,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32V-NEXT:    sub a0, a4, a1
-; CHECK-RV32V-NEXT:    sub a2, a3, a2
-; CHECK-RV32V-NEXT:    sltu a4, a4, a0
-; CHECK-RV32V-NEXT:    sltu a3, a3, a2
+; CHECK-RV32V-NEXT:    sltu a4, a1, a4
+; CHECK-RV32V-NEXT:    sub a5, a3, a2
+; CHECK-RV32V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32V-NEXT:    addi a4, a4, -1
-; CHECK-RV32V-NEXT:    addi a3, a3, -1
+; CHECK-RV32V-NEXT:    addi a2, a2, -1
 ; CHECK-RV32V-NEXT:    and a4, a4, a0
-; CHECK-RV32V-NEXT:    and a0, a3, a2
+; CHECK-RV32V-NEXT:    and a0, a2, a5
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v0
@@ -38136,10 +38136,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.pall
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT:    sub a1, a0, a1
-; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    sub a2, a0, a1
+; CHECK-RV32V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a0, a0, a1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -38234,102 +38234,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV64VC-NEXT:    mv a3, a6
 ; CHECK-RV64VC-NEXT:  .LBB916_2:
 ; CHECK-RV64VC-NEXT:    slli a5, s0, 4
-; CHECK-RV64VC-NEXT:    slli a7, s0, 1
-; CHECK-RV64VC-NEXT:    slli a2, s0, 3
+; CHECK-RV64VC-NEXT:    slli a1, s0, 1
+; CHECK-RV64VC-NEXT:    slli a7, s0, 3
 ; CHECK-RV64VC-NEXT:    mv a4, a3
-; CHECK-RV64VC-NEXT:    bltu a3, a7, .LBB916_4
+; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB916_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
-; CHECK-RV64VC-NEXT:    mv a4, a7
+; CHECK-RV64VC-NEXT:    mv a4, a1
 ; CHECK-RV64VC-NEXT:  .LBB916_4:
 ; CHECK-RV64VC-NEXT:    vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT:    add a1, s1, a0
+; CHECK-RV64VC-NEXT:    add a2, s1, a0
 ; CHECK-RV64VC-NEXT:    add a5, a5, s1
-; CHECK-RV64VC-NEXT:    add a2, a2, s1
+; CHECK-RV64VC-NEXT:    add a7, a7, s1
 ; CHECK-RV64VC-NEXT:    mv a0, a4
 ; CHECK-RV64VC-NEXT:    bltu a4, s0, .LBB916_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a0, s0
 ; CHECK-RV64VC-NEXT:  .LBB916_6:
-; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 3
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 3
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT:    addi a1, sp, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    mv a2, a1
-; CHECK-RV64VC-NEXT:    slli a1, a1, 1
-; CHECK-RV64VC-NEXT:    add a1, a1, a2
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    addi a2, sp, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    mv a5, a2
+; CHECK-RV64VC-NEXT:    slli a2, a2, 1
+; CHECK-RV64VC-NEXT:    add a2, a2, a5
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, s0
-; CHECK-RV64VC-NEXT:    sub a1, a3, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a1
+; CHECK-RV64VC-NEXT:    and a0, a3, a4
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT:    mv a1, a0
+; CHECK-RV64VC-NEXT:    mv a2, a0
 ; CHECK-RV64VC-NEXT:    bltu a0, s0, .LBB916_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a2, s0
 ; CHECK-RV64VC-NEXT:  .LBB916_8:
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT:    sub a1, a0, s0
 ; CHECK-RV64VC-NEXT:    sub a2, s2, a6
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, s2, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a6, s2
+; CHECK-RV64VC-NEXT:    sub a4, a0, s0
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
-; CHECK-RV64VC-NEXT:    and a1, a1, a0
+; CHECK-RV64VC-NEXT:    and a4, a4, a0
 ; CHECK-RV64VC-NEXT:    and a0, a3, a2
 ; CHECK-RV64VC-NEXT:    addi a2, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64VC-NEXT:    mv a2, a0
-; CHECK-RV64VC-NEXT:    bltu a0, a7, .LBB916_10
+; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB916_10
 ; CHECK-RV64VC-NEXT:  # %bb.9:
-; CHECK-RV64VC-NEXT:    mv a2, a7
+; CHECK-RV64VC-NEXT:    mv a2, a1
 ; CHECK-RV64VC-NEXT:  .LBB916_10:
-; CHECK-RV64VC-NEXT:    mv a1, a2
+; CHECK-RV64VC-NEXT:    mv a3, a2
 ; CHECK-RV64VC-NEXT:    bltu a2, s0, .LBB916_12
 ; CHECK-RV64VC-NEXT:  # %bb.11:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a3, s0
 ; CHECK-RV64VC-NEXT:  .LBB916_12:
-; CHECK-RV64VC-NEXT:    csrr a3, vlenb
-; CHECK-RV64VC-NEXT:    slli a3, a3, 3
-; CHECK-RV64VC-NEXT:    add a3, a3, sp
-; CHECK-RV64VC-NEXT:    addi a3, a3, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    csrr a4, vlenb
+; CHECK-RV64VC-NEXT:    slli a4, a4, 3
+; CHECK-RV64VC-NEXT:    add a4, a4, sp
+; CHECK-RV64VC-NEXT:    addi a4, a4, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT:    sub a3, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    sub a1, a2, s0
-; CHECK-RV64VC-NEXT:    sub a3, a0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a3
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a1, a1, a2
@@ -38359,7 +38359,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV64VC-NEXT:    c.ntl.pall
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a1, a0, s0
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -38428,13 +38428,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32VC-NEXT:    sub a0, a4, a1
-; CHECK-RV32VC-NEXT:    sub a2, a3, a2
-; CHECK-RV32VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV32VC-NEXT:    sub a5, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32VC-NEXT:    addi a4, a4, -1
-; CHECK-RV32VC-NEXT:    addi a3, a3, -1
+; CHECK-RV32VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a0
-; CHECK-RV32VC-NEXT:    and a0, a3, a2
+; CHECK-RV32VC-NEXT:    and a0, a2, a5
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v0
@@ -38446,10 +38446,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.pall
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT:    sub a1, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -38592,9 +38592,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, s0
+; CHECK-RV64V-NEXT:    sltu a4, s0, a4
 ; CHECK-RV64V-NEXT:    sub a5, a3, a1
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a5
+; CHECK-RV64V-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
@@ -38610,17 +38610,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT:    sub a3, a0, s0
-; CHECK-RV64V-NEXT:    sub a2, s1, a2
-; CHECK-RV64V-NEXT:    sltu a0, a0, a3
-; CHECK-RV64V-NEXT:    sltu a4, s1, a2
+; CHECK-RV64V-NEXT:    sub a3, s1, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, s1
+; CHECK-RV64V-NEXT:    sub a4, a0, s0
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    and a3, a0, a3
-; CHECK-RV64V-NEXT:    and a0, a4, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
+; CHECK-RV64V-NEXT:    and a4, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a2, a3
 ; CHECK-RV64V-NEXT:    addi a2, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a2, a0
@@ -38641,23 +38641,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT:    sub a3, a2, s0
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a2, a2, a3
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a3, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a1, a2, s0
+; CHECK-RV64V-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a2, a2, a3
-; CHECK-RV64V-NEXT:    and a0, a0, a1
-; CHECK-RV64V-NEXT:    csrr a1, vlenb
-; CHECK-RV64V-NEXT:    slli a1, a1, 3
-; CHECK-RV64V-NEXT:    mv a3, a1
-; CHECK-RV64V-NEXT:    slli a1, a1, 1
-; CHECK-RV64V-NEXT:    add a1, a1, a3
-; CHECK-RV64V-NEXT:    add a1, sp, a1
-; CHECK-RV64V-NEXT:    addi a1, a1, 16
-; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    and a1, a2, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a3
+; CHECK-RV64V-NEXT:    csrr a2, vlenb
+; CHECK-RV64V-NEXT:    slli a2, a2, 3
+; CHECK-RV64V-NEXT:    mv a3, a2
+; CHECK-RV64V-NEXT:    slli a2, a2, 1
+; CHECK-RV64V-NEXT:    add a2, a2, a3
+; CHECK-RV64V-NEXT:    add a2, sp, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, 16
+; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v29, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a1, a0
@@ -38674,7 +38674,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64V-NEXT:    ntl.s1
 ; CHECK-RV64V-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a1, a0, s0
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a1
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -38743,13 +38743,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32V-NEXT:    sub a0, a4, a1
-; CHECK-RV32V-NEXT:    sub a2, a3, a2
-; CHECK-RV32V-NEXT:    sltu a4, a4, a0
-; CHECK-RV32V-NEXT:    sltu a3, a3, a2
+; CHECK-RV32V-NEXT:    sltu a4, a1, a4
+; CHECK-RV32V-NEXT:    sub a5, a3, a2
+; CHECK-RV32V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32V-NEXT:    addi a4, a4, -1
-; CHECK-RV32V-NEXT:    addi a3, a3, -1
+; CHECK-RV32V-NEXT:    addi a2, a2, -1
 ; CHECK-RV32V-NEXT:    and a4, a4, a0
-; CHECK-RV32V-NEXT:    and a0, a3, a2
+; CHECK-RV32V-NEXT:    and a0, a2, a5
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v0
@@ -38761,10 +38761,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.s1
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT:    sub a1, a0, a1
-; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    sub a2, a0, a1
+; CHECK-RV32V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a0, a0, a1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -38859,102 +38859,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    mv a3, a6
 ; CHECK-RV64VC-NEXT:  .LBB917_2:
 ; CHECK-RV64VC-NEXT:    slli a5, s0, 4
-; CHECK-RV64VC-NEXT:    slli a7, s0, 1
-; CHECK-RV64VC-NEXT:    slli a2, s0, 3
+; CHECK-RV64VC-NEXT:    slli a1, s0, 1
+; CHECK-RV64VC-NEXT:    slli a7, s0, 3
 ; CHECK-RV64VC-NEXT:    mv a4, a3
-; CHECK-RV64VC-NEXT:    bltu a3, a7, .LBB917_4
+; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB917_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
-; CHECK-RV64VC-NEXT:    mv a4, a7
+; CHECK-RV64VC-NEXT:    mv a4, a1
 ; CHECK-RV64VC-NEXT:  .LBB917_4:
 ; CHECK-RV64VC-NEXT:    vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT:    add a1, s1, a0
+; CHECK-RV64VC-NEXT:    add a2, s1, a0
 ; CHECK-RV64VC-NEXT:    add a5, a5, s1
-; CHECK-RV64VC-NEXT:    add a2, a2, s1
+; CHECK-RV64VC-NEXT:    add a7, a7, s1
 ; CHECK-RV64VC-NEXT:    mv a0, a4
 ; CHECK-RV64VC-NEXT:    bltu a4, s0, .LBB917_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a0, s0
 ; CHECK-RV64VC-NEXT:  .LBB917_6:
-; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 3
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 3
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT:    addi a1, sp, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    mv a2, a1
-; CHECK-RV64VC-NEXT:    slli a1, a1, 1
-; CHECK-RV64VC-NEXT:    add a1, a1, a2
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    addi a2, sp, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    mv a5, a2
+; CHECK-RV64VC-NEXT:    slli a2, a2, 1
+; CHECK-RV64VC-NEXT:    add a2, a2, a5
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, s0
-; CHECK-RV64VC-NEXT:    sub a1, a3, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a1
+; CHECK-RV64VC-NEXT:    and a0, a3, a4
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT:    mv a1, a0
+; CHECK-RV64VC-NEXT:    mv a2, a0
 ; CHECK-RV64VC-NEXT:    bltu a0, s0, .LBB917_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a2, s0
 ; CHECK-RV64VC-NEXT:  .LBB917_8:
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT:    sub a1, a0, s0
 ; CHECK-RV64VC-NEXT:    sub a2, s2, a6
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, s2, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a6, s2
+; CHECK-RV64VC-NEXT:    sub a4, a0, s0
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
-; CHECK-RV64VC-NEXT:    and a1, a1, a0
+; CHECK-RV64VC-NEXT:    and a4, a4, a0
 ; CHECK-RV64VC-NEXT:    and a0, a3, a2
 ; CHECK-RV64VC-NEXT:    addi a2, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64VC-NEXT:    mv a2, a0
-; CHECK-RV64VC-NEXT:    bltu a0, a7, .LBB917_10
+; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB917_10
 ; CHECK-RV64VC-NEXT:  # %bb.9:
-; CHECK-RV64VC-NEXT:    mv a2, a7
+; CHECK-RV64VC-NEXT:    mv a2, a1
 ; CHECK-RV64VC-NEXT:  .LBB917_10:
-; CHECK-RV64VC-NEXT:    mv a1, a2
+; CHECK-RV64VC-NEXT:    mv a3, a2
 ; CHECK-RV64VC-NEXT:    bltu a2, s0, .LBB917_12
 ; CHECK-RV64VC-NEXT:  # %bb.11:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a3, s0
 ; CHECK-RV64VC-NEXT:  .LBB917_12:
-; CHECK-RV64VC-NEXT:    csrr a3, vlenb
-; CHECK-RV64VC-NEXT:    slli a3, a3, 3
-; CHECK-RV64VC-NEXT:    add a3, a3, sp
-; CHECK-RV64VC-NEXT:    addi a3, a3, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    csrr a4, vlenb
+; CHECK-RV64VC-NEXT:    slli a4, a4, 3
+; CHECK-RV64VC-NEXT:    add a4, a4, sp
+; CHECK-RV64VC-NEXT:    addi a4, a4, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT:    sub a3, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    sub a1, a2, s0
-; CHECK-RV64VC-NEXT:    sub a3, a0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a3
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a1, a1, a2
@@ -38984,7 +38984,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV64VC-NEXT:    c.ntl.s1
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a1, a0, s0
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -39053,13 +39053,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32VC-NEXT:    sub a0, a4, a1
-; CHECK-RV32VC-NEXT:    sub a2, a3, a2
-; CHECK-RV32VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV32VC-NEXT:    sub a5, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32VC-NEXT:    addi a4, a4, -1
-; CHECK-RV32VC-NEXT:    addi a3, a3, -1
+; CHECK-RV32VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a0
-; CHECK-RV32VC-NEXT:    and a0, a3, a2
+; CHECK-RV32VC-NEXT:    and a0, a2, a5
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v0
@@ -39071,10 +39071,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.s1
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT:    sub a1, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -39217,9 +39217,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, s0
+; CHECK-RV64V-NEXT:    sltu a4, s0, a4
 ; CHECK-RV64V-NEXT:    sub a5, a3, a1
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a5
+; CHECK-RV64V-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
@@ -39235,17 +39235,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT:    sub a3, a0, s0
-; CHECK-RV64V-NEXT:    sub a2, s1, a2
-; CHECK-RV64V-NEXT:    sltu a0, a0, a3
-; CHECK-RV64V-NEXT:    sltu a4, s1, a2
+; CHECK-RV64V-NEXT:    sub a3, s1, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, s1
+; CHECK-RV64V-NEXT:    sub a4, a0, s0
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    and a3, a0, a3
-; CHECK-RV64V-NEXT:    and a0, a4, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
+; CHECK-RV64V-NEXT:    and a4, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a2, a3
 ; CHECK-RV64V-NEXT:    addi a2, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a2, a0
@@ -39266,23 +39266,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT:    sub a3, a2, s0
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a2, a2, a3
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a3, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a1, a2, s0
+; CHECK-RV64V-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a2, a2, a3
-; CHECK-RV64V-NEXT:    and a0, a0, a1
-; CHECK-RV64V-NEXT:    csrr a1, vlenb
-; CHECK-RV64V-NEXT:    slli a1, a1, 3
-; CHECK-RV64V-NEXT:    mv a3, a1
-; CHECK-RV64V-NEXT:    slli a1, a1, 1
-; CHECK-RV64V-NEXT:    add a1, a1, a3
-; CHECK-RV64V-NEXT:    add a1, sp, a1
-; CHECK-RV64V-NEXT:    addi a1, a1, 16
-; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    and a1, a2, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a3
+; CHECK-RV64V-NEXT:    csrr a2, vlenb
+; CHECK-RV64V-NEXT:    slli a2, a2, 3
+; CHECK-RV64V-NEXT:    mv a3, a2
+; CHECK-RV64V-NEXT:    slli a2, a2, 1
+; CHECK-RV64V-NEXT:    add a2, a2, a3
+; CHECK-RV64V-NEXT:    add a2, sp, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, 16
+; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v29, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a1, a0
@@ -39299,7 +39299,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a1, a0, s0
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a1
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -39368,13 +39368,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32V-NEXT:    sub a0, a4, a1
-; CHECK-RV32V-NEXT:    sub a2, a3, a2
-; CHECK-RV32V-NEXT:    sltu a4, a4, a0
-; CHECK-RV32V-NEXT:    sltu a3, a3, a2
+; CHECK-RV32V-NEXT:    sltu a4, a1, a4
+; CHECK-RV32V-NEXT:    sub a5, a3, a2
+; CHECK-RV32V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32V-NEXT:    addi a4, a4, -1
-; CHECK-RV32V-NEXT:    addi a3, a3, -1
+; CHECK-RV32V-NEXT:    addi a2, a2, -1
 ; CHECK-RV32V-NEXT:    and a4, a4, a0
-; CHECK-RV32V-NEXT:    and a0, a3, a2
+; CHECK-RV32V-NEXT:    and a0, a2, a5
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v0
@@ -39386,10 +39386,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT:    sub a1, a0, a1
-; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    sub a2, a0, a1
+; CHECK-RV32V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a0, a0, a1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -39484,102 +39484,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV64VC-NEXT:    mv a3, a6
 ; CHECK-RV64VC-NEXT:  .LBB918_2:
 ; CHECK-RV64VC-NEXT:    slli a5, s0, 4
-; CHECK-RV64VC-NEXT:    slli a7, s0, 1
-; CHECK-RV64VC-NEXT:    slli a2, s0, 3
+; CHECK-RV64VC-NEXT:    slli a1, s0, 1
+; CHECK-RV64VC-NEXT:    slli a7, s0, 3
 ; CHECK-RV64VC-NEXT:    mv a4, a3
-; CHECK-RV64VC-NEXT:    bltu a3, a7, .LBB918_4
+; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB918_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
-; CHECK-RV64VC-NEXT:    mv a4, a7
+; CHECK-RV64VC-NEXT:    mv a4, a1
 ; CHECK-RV64VC-NEXT:  .LBB918_4:
 ; CHECK-RV64VC-NEXT:    vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT:    add a1, s1, a0
+; CHECK-RV64VC-NEXT:    add a2, s1, a0
 ; CHECK-RV64VC-NEXT:    add a5, a5, s1
-; CHECK-RV64VC-NEXT:    add a2, a2, s1
+; CHECK-RV64VC-NEXT:    add a7, a7, s1
 ; CHECK-RV64VC-NEXT:    mv a0, a4
 ; CHECK-RV64VC-NEXT:    bltu a4, s0, .LBB918_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a0, s0
 ; CHECK-RV64VC-NEXT:  .LBB918_6:
-; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 3
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 3
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT:    addi a1, sp, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    mv a2, a1
-; CHECK-RV64VC-NEXT:    slli a1, a1, 1
-; CHECK-RV64VC-NEXT:    add a1, a1, a2
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    addi a2, sp, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    mv a5, a2
+; CHECK-RV64VC-NEXT:    slli a2, a2, 1
+; CHECK-RV64VC-NEXT:    add a2, a2, a5
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, s0
-; CHECK-RV64VC-NEXT:    sub a1, a3, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a1
+; CHECK-RV64VC-NEXT:    and a0, a3, a4
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT:    mv a1, a0
+; CHECK-RV64VC-NEXT:    mv a2, a0
 ; CHECK-RV64VC-NEXT:    bltu a0, s0, .LBB918_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a2, s0
 ; CHECK-RV64VC-NEXT:  .LBB918_8:
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT:    sub a1, a0, s0
 ; CHECK-RV64VC-NEXT:    sub a2, s2, a6
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, s2, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a6, s2
+; CHECK-RV64VC-NEXT:    sub a4, a0, s0
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
-; CHECK-RV64VC-NEXT:    and a1, a1, a0
+; CHECK-RV64VC-NEXT:    and a4, a4, a0
 ; CHECK-RV64VC-NEXT:    and a0, a3, a2
 ; CHECK-RV64VC-NEXT:    addi a2, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64VC-NEXT:    mv a2, a0
-; CHECK-RV64VC-NEXT:    bltu a0, a7, .LBB918_10
+; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB918_10
 ; CHECK-RV64VC-NEXT:  # %bb.9:
-; CHECK-RV64VC-NEXT:    mv a2, a7
+; CHECK-RV64VC-NEXT:    mv a2, a1
 ; CHECK-RV64VC-NEXT:  .LBB918_10:
-; CHECK-RV64VC-NEXT:    mv a1, a2
+; CHECK-RV64VC-NEXT:    mv a3, a2
 ; CHECK-RV64VC-NEXT:    bltu a2, s0, .LBB918_12
 ; CHECK-RV64VC-NEXT:  # %bb.11:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a3, s0
 ; CHECK-RV64VC-NEXT:  .LBB918_12:
-; CHECK-RV64VC-NEXT:    csrr a3, vlenb
-; CHECK-RV64VC-NEXT:    slli a3, a3, 3
-; CHECK-RV64VC-NEXT:    add a3, a3, sp
-; CHECK-RV64VC-NEXT:    addi a3, a3, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    csrr a4, vlenb
+; CHECK-RV64VC-NEXT:    slli a4, a4, 3
+; CHECK-RV64VC-NEXT:    add a4, a4, sp
+; CHECK-RV64VC-NEXT:    addi a4, a4, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT:    sub a3, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    sub a1, a2, s0
-; CHECK-RV64VC-NEXT:    sub a3, a0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a3
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a1, a1, a2
@@ -39609,7 +39609,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a1, a0, s0
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -39678,13 +39678,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32VC-NEXT:    sub a0, a4, a1
-; CHECK-RV32VC-NEXT:    sub a2, a3, a2
-; CHECK-RV32VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV32VC-NEXT:    sub a5, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32VC-NEXT:    addi a4, a4, -1
-; CHECK-RV32VC-NEXT:    addi a3, a3, -1
+; CHECK-RV32VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a0
-; CHECK-RV32VC-NEXT:    and a0, a3, a2
+; CHECK-RV32VC-NEXT:    and a0, a2, a5
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v0
@@ -39696,10 +39696,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT:    sub a1, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -39841,9 +39841,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64V-NEXT:    sub a0, a4, s0
+; CHECK-RV64V-NEXT:    sltu a4, s0, a4
 ; CHECK-RV64V-NEXT:    sub a5, a3, a1
-; CHECK-RV64V-NEXT:    sltu a4, a4, a0
-; CHECK-RV64V-NEXT:    sltu a3, a3, a5
+; CHECK-RV64V-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64V-NEXT:    addi a4, a4, -1
 ; CHECK-RV64V-NEXT:    addi a3, a3, -1
 ; CHECK-RV64V-NEXT:    and a4, a4, a0
@@ -39859,17 +39859,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT:    sub a3, a0, s0
-; CHECK-RV64V-NEXT:    sub a2, s1, a2
-; CHECK-RV64V-NEXT:    sltu a0, a0, a3
-; CHECK-RV64V-NEXT:    sltu a4, s1, a2
+; CHECK-RV64V-NEXT:    sub a3, s1, a2
+; CHECK-RV64V-NEXT:    sltu a2, a2, s1
+; CHECK-RV64V-NEXT:    sub a4, a0, s0
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    addi a4, a4, -1
-; CHECK-RV64V-NEXT:    and a3, a0, a3
-; CHECK-RV64V-NEXT:    and a0, a4, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, -1
+; CHECK-RV64V-NEXT:    and a4, a0, a4
+; CHECK-RV64V-NEXT:    and a0, a2, a3
 ; CHECK-RV64V-NEXT:    addi a2, sp, 16
 ; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a2, a0
@@ -39890,23 +39890,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV64V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT:    sub a3, a2, s0
-; CHECK-RV64V-NEXT:    sub a1, a0, a1
-; CHECK-RV64V-NEXT:    sltu a2, a2, a3
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sub a3, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, a1, a0
+; CHECK-RV64V-NEXT:    sub a1, a2, s0
+; CHECK-RV64V-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64V-NEXT:    addi a2, a2, -1
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
-; CHECK-RV64V-NEXT:    and a2, a2, a3
-; CHECK-RV64V-NEXT:    and a0, a0, a1
-; CHECK-RV64V-NEXT:    csrr a1, vlenb
-; CHECK-RV64V-NEXT:    slli a1, a1, 3
-; CHECK-RV64V-NEXT:    mv a3, a1
-; CHECK-RV64V-NEXT:    slli a1, a1, 1
-; CHECK-RV64V-NEXT:    add a1, a1, a3
-; CHECK-RV64V-NEXT:    add a1, sp, a1
-; CHECK-RV64V-NEXT:    addi a1, a1, 16
-; CHECK-RV64V-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT:    and a1, a2, a1
+; CHECK-RV64V-NEXT:    and a0, a0, a3
+; CHECK-RV64V-NEXT:    csrr a2, vlenb
+; CHECK-RV64V-NEXT:    slli a2, a2, 3
+; CHECK-RV64V-NEXT:    mv a3, a2
+; CHECK-RV64V-NEXT:    slli a2, a2, 1
+; CHECK-RV64V-NEXT:    add a2, a2, a3
+; CHECK-RV64V-NEXT:    add a2, sp, a2
+; CHECK-RV64V-NEXT:    addi a2, a2, 16
+; CHECK-RV64V-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v29, (zero), v8
 ; CHECK-RV64V-NEXT:    mv a1, a0
@@ -39923,7 +39923,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV64V-NEXT:    ntl.all
 ; CHECK-RV64V-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64V-NEXT:    sub a1, a0, s0
-; CHECK-RV64V-NEXT:    sltu a0, a0, a1
+; CHECK-RV64V-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64V-NEXT:    addi a0, a0, -1
 ; CHECK-RV64V-NEXT:    and a0, a0, a1
 ; CHECK-RV64V-NEXT:    csrr a1, vlenb
@@ -39992,13 +39992,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32V-NEXT:    sub a0, a4, a1
-; CHECK-RV32V-NEXT:    sub a2, a3, a2
-; CHECK-RV32V-NEXT:    sltu a4, a4, a0
-; CHECK-RV32V-NEXT:    sltu a3, a3, a2
+; CHECK-RV32V-NEXT:    sltu a4, a1, a4
+; CHECK-RV32V-NEXT:    sub a5, a3, a2
+; CHECK-RV32V-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32V-NEXT:    addi a4, a4, -1
-; CHECK-RV32V-NEXT:    addi a3, a3, -1
+; CHECK-RV32V-NEXT:    addi a2, a2, -1
 ; CHECK-RV32V-NEXT:    and a4, a4, a0
-; CHECK-RV32V-NEXT:    and a0, a3, a2
+; CHECK-RV32V-NEXT:    and a0, a2, a5
 ; CHECK-RV32V-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v10, (zero), v0
@@ -40010,10 +40010,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV32V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32V-NEXT:    ntl.all
 ; CHECK-RV32V-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT:    sub a1, a0, a1
-; CHECK-RV32V-NEXT:    sltu a0, a0, a1
+; CHECK-RV32V-NEXT:    sub a2, a0, a1
+; CHECK-RV32V-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32V-NEXT:    addi a0, a0, -1
-; CHECK-RV32V-NEXT:    and a0, a0, a1
+; CHECK-RV32V-NEXT:    and a0, a0, a2
 ; CHECK-RV32V-NEXT:    addi a1, sp, 16
 ; CHECK-RV32V-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32V-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -40108,102 +40108,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV64VC-NEXT:    mv a3, a6
 ; CHECK-RV64VC-NEXT:  .LBB919_2:
 ; CHECK-RV64VC-NEXT:    slli a5, s0, 4
-; CHECK-RV64VC-NEXT:    slli a7, s0, 1
-; CHECK-RV64VC-NEXT:    slli a2, s0, 3
+; CHECK-RV64VC-NEXT:    slli a1, s0, 1
+; CHECK-RV64VC-NEXT:    slli a7, s0, 3
 ; CHECK-RV64VC-NEXT:    mv a4, a3
-; CHECK-RV64VC-NEXT:    bltu a3, a7, .LBB919_4
+; CHECK-RV64VC-NEXT:    bltu a3, a1, .LBB919_4
 ; CHECK-RV64VC-NEXT:  # %bb.3:
-; CHECK-RV64VC-NEXT:    mv a4, a7
+; CHECK-RV64VC-NEXT:    mv a4, a1
 ; CHECK-RV64VC-NEXT:  .LBB919_4:
 ; CHECK-RV64VC-NEXT:    vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT:    add a1, s1, a0
+; CHECK-RV64VC-NEXT:    add a2, s1, a0
 ; CHECK-RV64VC-NEXT:    add a5, a5, s1
-; CHECK-RV64VC-NEXT:    add a2, a2, s1
+; CHECK-RV64VC-NEXT:    add a7, a7, s1
 ; CHECK-RV64VC-NEXT:    mv a0, a4
 ; CHECK-RV64VC-NEXT:    bltu a4, s0, .LBB919_6
 ; CHECK-RV64VC-NEXT:  # %bb.5:
 ; CHECK-RV64VC-NEXT:    mv a0, s0
 ; CHECK-RV64VC-NEXT:  .LBB919_6:
-; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 3
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 3
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-RV64VC-NEXT:    vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT:    addi a1, sp, 16
-; CHECK-RV64VC-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    mv a2, a1
-; CHECK-RV64VC-NEXT:    slli a1, a1, 1
-; CHECK-RV64VC-NEXT:    add a1, a1, a2
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    csrr a1, vlenb
-; CHECK-RV64VC-NEXT:    slli a1, a1, 4
-; CHECK-RV64VC-NEXT:    add a1, a1, sp
-; CHECK-RV64VC-NEXT:    addi a1, a1, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    addi a2, sp, 16
+; CHECK-RV64VC-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT:    vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    mv a5, a2
+; CHECK-RV64VC-NEXT:    slli a2, a2, 1
+; CHECK-RV64VC-NEXT:    add a2, a2, a5
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    csrr a2, vlenb
+; CHECK-RV64VC-NEXT:    slli a2, a2, 4
+; CHECK-RV64VC-NEXT:    add a2, a2, sp
+; CHECK-RV64VC-NEXT:    addi a2, a2, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v24, (zero), v16
 ; CHECK-RV64VC-NEXT:    sub a0, a4, s0
-; CHECK-RV64VC-NEXT:    sub a1, a3, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a4, a0
-; CHECK-RV64VC-NEXT:    sltu a3, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a4
+; CHECK-RV64VC-NEXT:    sub a4, a3, a1
+; CHECK-RV64VC-NEXT:    sltu a3, a1, a3
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
 ; CHECK-RV64VC-NEXT:    and a2, a2, a0
-; CHECK-RV64VC-NEXT:    and a0, a3, a1
+; CHECK-RV64VC-NEXT:    and a0, a3, a4
 ; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT:    mv a1, a0
+; CHECK-RV64VC-NEXT:    mv a2, a0
 ; CHECK-RV64VC-NEXT:    bltu a0, s0, .LBB919_8
 ; CHECK-RV64VC-NEXT:  # %bb.7:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a2, s0
 ; CHECK-RV64VC-NEXT:  .LBB919_8:
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT:    sub a1, a0, s0
 ; CHECK-RV64VC-NEXT:    sub a2, s2, a6
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
-; CHECK-RV64VC-NEXT:    sltu a3, s2, a2
+; CHECK-RV64VC-NEXT:    sltu a3, a6, s2
+; CHECK-RV64VC-NEXT:    sub a4, a0, s0
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    addi a3, a3, -1
-; CHECK-RV64VC-NEXT:    and a1, a1, a0
+; CHECK-RV64VC-NEXT:    and a4, a4, a0
 ; CHECK-RV64VC-NEXT:    and a0, a3, a2
 ; CHECK-RV64VC-NEXT:    addi a2, sp, 16
 ; CHECK-RV64VC-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v27, (zero), v8
 ; CHECK-RV64VC-NEXT:    mv a2, a0
-; CHECK-RV64VC-NEXT:    bltu a0, a7, .LBB919_10
+; CHECK-RV64VC-NEXT:    bltu a0, a1, .LBB919_10
 ; CHECK-RV64VC-NEXT:  # %bb.9:
-; CHECK-RV64VC-NEXT:    mv a2, a7
+; CHECK-RV64VC-NEXT:    mv a2, a1
 ; CHECK-RV64VC-NEXT:  .LBB919_10:
-; CHECK-RV64VC-NEXT:    mv a1, a2
+; CHECK-RV64VC-NEXT:    mv a3, a2
 ; CHECK-RV64VC-NEXT:    bltu a2, s0, .LBB919_12
 ; CHECK-RV64VC-NEXT:  # %bb.11:
-; CHECK-RV64VC-NEXT:    mv a1, s0
+; CHECK-RV64VC-NEXT:    mv a3, s0
 ; CHECK-RV64VC-NEXT:  .LBB919_12:
-; CHECK-RV64VC-NEXT:    csrr a3, vlenb
-; CHECK-RV64VC-NEXT:    slli a3, a3, 3
-; CHECK-RV64VC-NEXT:    add a3, a3, sp
-; CHECK-RV64VC-NEXT:    addi a3, a3, 16
-; CHECK-RV64VC-NEXT:    vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT:    csrr a4, vlenb
+; CHECK-RV64VC-NEXT:    slli a4, a4, 3
+; CHECK-RV64VC-NEXT:    add a4, a4, sp
+; CHECK-RV64VC-NEXT:    addi a4, a4, 16
+; CHECK-RV64VC-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT:    sub a3, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV64VC-NEXT:    sub a1, a2, s0
-; CHECK-RV64VC-NEXT:    sub a3, a0, a7
-; CHECK-RV64VC-NEXT:    sltu a2, a2, a1
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a3
+; CHECK-RV64VC-NEXT:    sltu a2, s0, a2
 ; CHECK-RV64VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a1, a1, a2
@@ -40233,7 +40233,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV64VC-NEXT:    c.ntl.all
 ; CHECK-RV64VC-NEXT:    vsoxei64.v v30, (zero), v8
 ; CHECK-RV64VC-NEXT:    sub a1, a0, s0
-; CHECK-RV64VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV64VC-NEXT:    sltu a0, s0, a0
 ; CHECK-RV64VC-NEXT:    addi a0, a0, -1
 ; CHECK-RV64VC-NEXT:    and a0, a0, a1
 ; CHECK-RV64VC-NEXT:    csrr a1, vlenb
@@ -40302,13 +40302,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v8, (zero), v16
 ; CHECK-RV32VC-NEXT:    sub a0, a4, a1
-; CHECK-RV32VC-NEXT:    sub a2, a3, a2
-; CHECK-RV32VC-NEXT:    sltu a4, a4, a0
-; CHECK-RV32VC-NEXT:    sltu a3, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a4, a1, a4
+; CHECK-RV32VC-NEXT:    sub a5, a3, a2
+; CHECK-RV32VC-NEXT:    sltu a2, a2, a3
 ; CHECK-RV32VC-NEXT:    addi a4, a4, -1
-; CHECK-RV32VC-NEXT:    addi a3, a3, -1
+; CHECK-RV32VC-NEXT:    addi a2, a2, -1
 ; CHECK-RV32VC-NEXT:    and a4, a4, a0
-; CHECK-RV32VC-NEXT:    and a0, a3, a2
+; CHECK-RV32VC-NEXT:    and a0, a2, a5
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v10, (zero), v0
@@ -40320,10 +40320,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-RV32VC-NEXT:    c.ntl.all
 ; CHECK-RV32VC-NEXT:    vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT:    sub a1, a0, a1
-; CHECK-RV32VC-NEXT:    sltu a0, a0, a1
+; CHECK-RV32VC-NEXT:    sub a2, a0, a1
+; CHECK-RV32VC-NEXT:    sltu a0, a1, a0
 ; CHECK-RV32VC-NEXT:    addi a0, a0, -1
-; CHECK-RV32VC-NEXT:    and a0, a0, a1
+; CHECK-RV32VC-NEXT:    and a0, a0, a2
 ; CHECK-RV32VC-NEXT:    addi a1, sp, 16
 ; CHECK-RV32VC-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; CHECK-RV32VC-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
index 380287dd555c9..1c95c753c8ed1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
@@ -263,13 +263,13 @@ define <vscale x 32 x bfloat> @vp_rint_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v5, v6
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -321,14 +321,14 @@ define <vscale x 32 x bfloat> @vp_rint_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v6, v7
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -786,13 +786,13 @@ define <vscale x 32 x half> @vp_rint_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v5, v6
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -858,14 +858,14 @@ define <vscale x 32 x half> @vp_rint_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v6, v7
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -1751,7 +1751,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFH-NEXT:    sub a2, a0, a1
 ; RV32ZVFH-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT:    sltu a3, a0, a2
+; RV32ZVFH-NEXT:    sltu a3, a1, a0
 ; RV32ZVFH-NEXT:    addi a3, a3, -1
 ; RV32ZVFH-NEXT:    and a2, a3, a2
 ; RV32ZVFH-NEXT:    vmv1r.v v0, v6
@@ -1793,7 +1793,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
@@ -1834,7 +1834,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; RV32ZVFMIN-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFMIN-NEXT:    sub a2, a0, a1
 ; RV32ZVFMIN-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFMIN-NEXT:    sltu a3, a0, a2
+; RV32ZVFMIN-NEXT:    sltu a3, a1, a0
 ; RV32ZVFMIN-NEXT:    addi a3, a3, -1
 ; RV32ZVFMIN-NEXT:    and a2, a3, a2
 ; RV32ZVFMIN-NEXT:    vmv1r.v v0, v6
@@ -1876,7 +1876,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -1917,7 +1917,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFH-NEXT:    sub a3, a0, a1
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT:    sltu a2, a0, a3
+; RV32ZVFH-NEXT:    sltu a2, a1, a0
 ; RV32ZVFH-NEXT:    addi a2, a2, -1
 ; RV32ZVFH-NEXT:    and a2, a2, a3
 ; RV32ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -1947,7 +1947,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -1976,7 +1976,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV32ZVFMIN-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFMIN-NEXT:    sub a3, a0, a1
 ; RV32ZVFMIN-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFMIN-NEXT:    sltu a2, a0, a3
+; RV32ZVFMIN-NEXT:    sltu a2, a1, a0
 ; RV32ZVFMIN-NEXT:    addi a2, a2, -1
 ; RV32ZVFMIN-NEXT:    and a2, a2, a3
 ; RV32ZVFMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2006,7 +2006,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
index 37c036d38148a..605b07c81f45a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v5, v6
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v6, v7
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v5, v6
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v6, v7
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFH-NEXT:    sub a2, a0, a1
 ; RV32ZVFH-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT:    sltu a3, a0, a2
+; RV32ZVFH-NEXT:    sltu a3, a1, a0
 ; RV32ZVFH-NEXT:    addi a3, a3, -1
 ; RV32ZVFH-NEXT:    and a2, a3, a2
 ; RV32ZVFH-NEXT:    vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFHMIN-NEXT:    sub a2, a0, a1
 ; RV32ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT:    sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a3, a3, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a3, a2
 ; RV32ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFH-NEXT:    sub a3, a0, a1
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT:    sltu a2, a0, a3
+; RV32ZVFH-NEXT:    sltu a2, a1, a0
 ; RV32ZVFH-NEXT:    addi a2, a2, -1
 ; RV32ZVFH-NEXT:    and a2, a2, a3
 ; RV32ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
index 37a9ec1c0a8aa..6869bc2050698 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16(<vscale x 32 x bfloat> %va
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v5, v6
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16_unmasked(<vscale x 32 x bf
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v6, v7
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v5, v6
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v6, v7
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFH-NEXT:    sub a2, a0, a1
 ; RV32ZVFH-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT:    sltu a3, a0, a2
+; RV32ZVFH-NEXT:    sltu a3, a1, a0
 ; RV32ZVFH-NEXT:    addi a3, a3, -1
 ; RV32ZVFH-NEXT:    and a2, a3, a2
 ; RV32ZVFH-NEXT:    vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFHMIN-NEXT:    sub a2, a0, a1
 ; RV32ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT:    sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a3, a3, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a3, a2
 ; RV32ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFH-NEXT:    sub a3, a0, a1
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT:    sltu a2, a0, a3
+; RV32ZVFH-NEXT:    sltu a2, a1, a0
 ; RV32ZVFH-NEXT:    addi a2, a2, -1
 ; RV32ZVFH-NEXT:    and a2, a2, a3
 ; RV32ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
index 5553b988fec97..8869a440c8634 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16(<vscale x 32 x bfloat> %
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v6, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v5, v6
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16_unmasked(<vscale x 32 x
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    fmv.w.x fa5, a3
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v7, v16, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vmv1r.v v6, v7
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v5, v6
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x hal
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vmv1r.v v6, v7
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFH-NEXT:    sub a2, a0, a1
 ; RV32ZVFH-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT:    sltu a3, a0, a2
+; RV32ZVFH-NEXT:    sltu a3, a1, a0
 ; RV32ZVFH-NEXT:    addi a3, a3, -1
 ; RV32ZVFH-NEXT:    and a2, a3, a2
 ; RV32ZVFH-NEXT:    vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; RV32ZVFHMIN-NEXT:    sub a2, a0, a1
 ; RV32ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT:    sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a3, a3, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a3, a2
 ; RV32ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
 ; RV32ZVFH-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFH-NEXT:    sub a3, a0, a1
 ; RV32ZVFH-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT:    sltu a2, a0, a3
+; RV32ZVFH-NEXT:    sltu a2, a1, a0
 ; RV32ZVFH-NEXT:    addi a2, a2, -1
 ; RV32ZVFH-NEXT:    and a2, a2, a3
 ; RV32ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
 ; RV64ZVFH-NEXT:    sub a3, a0, a1
 ; RV64ZVFH-NEXT:    slli a2, a2, 52
 ; RV64ZVFH-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFH-NEXT:    sltu a2, a0, a3
+; RV64ZVFH-NEXT:    sltu a2, a1, a0
 ; RV64ZVFH-NEXT:    addi a2, a2, -1
 ; RV64ZVFH-NEXT:    and a2, a2, a3
 ; RV64ZVFH-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
 ; RV32ZVFHMIN-NEXT:    lui a2, %hi(.LCPI45_0)
 ; RV32ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV32ZVFHMIN-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV32ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV32ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV32ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
 ; RV64ZVFHMIN-NEXT:    sub a3, a0, a1
 ; RV64ZVFHMIN-NEXT:    slli a2, a2, 52
 ; RV64ZVFHMIN-NEXT:    fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT:    sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT:    sltu a2, a1, a0
 ; RV64ZVFHMIN-NEXT:    addi a2, a2, -1
 ; RV64ZVFHMIN-NEXT:    and a2, a2, a3
 ; RV64ZVFHMIN-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 634e58198def3..b67ab5c3c9efa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1337,211 +1337,404 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, b
 }
 
 define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vscale x 64 x bfloat> %vb, <vscale x 64 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: fcmp_oeq_vv_nxv64bf16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    slli a1, a1, 2
-; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 2
-; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
-; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv8r.v v0, v16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, a1, a3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a1, a3, 3
-; CHECK-NEXT:    slli a5, a3, 2
-; CHECK-NEXT:    slli a4, a3, 1
-; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    sub a6, a2, a5
-; CHECK-NEXT:    vl8re16.v v24, (a1)
-; CHECK-NEXT:    sltu a1, a2, a6
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a6, a1, a6
-; CHECK-NEXT:    sub a1, a6, a4
-; CHECK-NEXT:    sltu a7, a6, a1
-; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    and a7, a7, a1
-; CHECK-NEXT:    srli a1, a3, 1
-; CHECK-NEXT:    srli a3, a3, 2
-; CHECK-NEXT:    csrr t0, vlenb
-; CHECK-NEXT:    slli t0, t0, 1
-; CHECK-NEXT:    mv t1, t0
-; CHECK-NEXT:    slli t0, t0, 2
-; CHECK-NEXT:    add t1, t1, t0
-; CHECK-NEXT:    slli t0, t0, 1
-; CHECK-NEXT:    add t0, t0, t1
-; CHECK-NEXT:    add t0, sp, t0
-; CHECK-NEXT:    addi t0, t0, 16
-; CHECK-NEXT:    vl1r.v v8, (t0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT:    vslidedown.vx v16, v8, a1
-; CHECK-NEXT:    vl8re16.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    mv t0, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a0, a0, t0
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a3
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a7, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v4
-; CHECK-NEXT:    bltu a6, a4, .LBB85_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a6, a4
-; CHECK-NEXT:  .LBB85_2:
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v5, v8, v16, v0.t
-; CHECK-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v7, v24, v16, v0.t
-; CHECK-NEXT:    bltu a2, a5, .LBB85_4
-; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    mv a2, a5
-; CHECK-NEXT:  .LBB85_4:
-; CHECK-NEXT:    sub a0, a2, a4
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 1
-; CHECK-NEXT:    mv a6, a5
-; CHECK-NEXT:    slli a5, a5, 2
-; CHECK-NEXT:    add a6, a6, a5
-; CHECK-NEXT:    slli a5, a5, 1
-; CHECK-NEXT:    add a5, a5, a6
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl1r.v v8, (a5) # vscale x 8-byte Folded Reload
-; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a3
-; CHECK-NEXT:    sltu a5, a2, a0
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a0, a5, a0
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 1
-; CHECK-NEXT:    mv a6, a5
-; CHECK-NEXT:    slli a5, a5, 3
-; CHECK-NEXT:    add a5, a5, a6
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    mv a5, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a0, a0, a5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v10, v16, v24, v0.t
-; CHECK-NEXT:    vmv1r.v v9, v7
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v5, a3
-; CHECK-NEXT:    bltu a2, a4, .LBB85_6
-; CHECK-NEXT:  # %bb.5:
-; CHECK-NEXT:    mv a2, a4
-; CHECK-NEXT:  .LBB85_6:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    mv a4, a0
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, a0, a4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    mv a2, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    mv a2, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a2, a2, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v10, a3
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v9, a1
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
+; CHECK32-LABEL: fcmp_oeq_vv_nxv64bf16:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    addi sp, sp, -16
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    csrr a1, vlenb
+; CHECK32-NEXT:    mv a3, a1
+; CHECK32-NEXT:    slli a1, a1, 1
+; CHECK32-NEXT:    add a3, a3, a1
+; CHECK32-NEXT:    slli a1, a1, 2
+; CHECK32-NEXT:    add a3, a3, a1
+; CHECK32-NEXT:    slli a1, a1, 1
+; CHECK32-NEXT:    add a1, a1, a3
+; CHECK32-NEXT:    sub sp, sp, a1
+; CHECK32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
+; CHECK32-NEXT:    csrr a1, vlenb
+; CHECK32-NEXT:    slli a1, a1, 1
+; CHECK32-NEXT:    mv a3, a1
+; CHECK32-NEXT:    slli a1, a1, 2
+; CHECK32-NEXT:    add a3, a3, a1
+; CHECK32-NEXT:    slli a1, a1, 1
+; CHECK32-NEXT:    add a1, a1, a3
+; CHECK32-NEXT:    add a1, sp, a1
+; CHECK32-NEXT:    addi a1, a1, 16
+; CHECK32-NEXT:    vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK32-NEXT:    vmv8r.v v0, v16
+; CHECK32-NEXT:    csrr a1, vlenb
+; CHECK32-NEXT:    slli a1, a1, 1
+; CHECK32-NEXT:    mv a3, a1
+; CHECK32-NEXT:    slli a1, a1, 3
+; CHECK32-NEXT:    add a1, a1, a3
+; CHECK32-NEXT:    add a1, sp, a1
+; CHECK32-NEXT:    addi a1, a1, 16
+; CHECK32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT:    csrr a3, vlenb
+; CHECK32-NEXT:    srli a1, a3, 1
+; CHECK32-NEXT:    slli a4, a3, 3
+; CHECK32-NEXT:    slli a6, a3, 2
+; CHECK32-NEXT:    slli a5, a3, 1
+; CHECK32-NEXT:    add a4, a0, a4
+; CHECK32-NEXT:    sub a7, a2, a6
+; CHECK32-NEXT:    sltu t0, a6, a2
+; CHECK32-NEXT:    vl8re16.v v24, (a4)
+; CHECK32-NEXT:    addi t0, t0, -1
+; CHECK32-NEXT:    and a7, t0, a7
+; CHECK32-NEXT:    sub a4, a7, a5
+; CHECK32-NEXT:    sltu t0, a5, a7
+; CHECK32-NEXT:    addi t0, t0, -1
+; CHECK32-NEXT:    and t0, t0, a4
+; CHECK32-NEXT:    srli a4, a3, 2
+; CHECK32-NEXT:    csrr t1, vlenb
+; CHECK32-NEXT:    slli t1, t1, 1
+; CHECK32-NEXT:    mv t2, t1
+; CHECK32-NEXT:    slli t1, t1, 2
+; CHECK32-NEXT:    add t2, t2, t1
+; CHECK32-NEXT:    slli t1, t1, 1
+; CHECK32-NEXT:    add t1, t1, t2
+; CHECK32-NEXT:    add t1, sp, t1
+; CHECK32-NEXT:    addi t1, t1, 16
+; CHECK32-NEXT:    vl1r.v v8, (t1) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT:    vslidedown.vx v16, v8, a1
+; CHECK32-NEXT:    vl8re16.v v8, (a0)
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    mv t1, a0
+; CHECK32-NEXT:    slli a0, a0, 2
+; CHECK32-NEXT:    add a0, a0, t1
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK32-NEXT:    vslidedown.vx v8, v16, a4
+; CHECK32-NEXT:    addi a0, sp, 16
+; CHECK32-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT:    vsetvli zero, t0, e16, m4, ta, ma
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v8, v4
+; CHECK32-NEXT:    bltu a7, a5, .LBB85_2
+; CHECK32-NEXT:  # %bb.1:
+; CHECK32-NEXT:    mv a7, a5
+; CHECK32-NEXT:  .LBB85_2:
+; CHECK32-NEXT:    addi a0, sp, 16
+; CHECK32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT:    vmfeq.vv v5, v8, v16, v0.t
+; CHECK32-NEXT:    vsetvli zero, a7, e16, m4, ta, ma
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT:    vmfeq.vv v7, v24, v16, v0.t
+; CHECK32-NEXT:    bltu a2, a6, .LBB85_4
+; CHECK32-NEXT:  # %bb.3:
+; CHECK32-NEXT:    mv a2, a6
+; CHECK32-NEXT:  .LBB85_4:
+; CHECK32-NEXT:    sub a0, a2, a5
+; CHECK32-NEXT:    sltu a6, a5, a2
+; CHECK32-NEXT:    csrr a7, vlenb
+; CHECK32-NEXT:    slli a7, a7, 1
+; CHECK32-NEXT:    mv t0, a7
+; CHECK32-NEXT:    slli a7, a7, 2
+; CHECK32-NEXT:    add t0, t0, a7
+; CHECK32-NEXT:    slli a7, a7, 1
+; CHECK32-NEXT:    add a7, a7, t0
+; CHECK32-NEXT:    add a7, sp, a7
+; CHECK32-NEXT:    addi a7, a7, 16
+; CHECK32-NEXT:    vl1r.v v8, (a7) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT:    vsetvli a7, zero, e8, mf2, ta, ma
+; CHECK32-NEXT:    vslidedown.vx v0, v8, a4
+; CHECK32-NEXT:    addi a6, a6, -1
+; CHECK32-NEXT:    and a0, a6, a0
+; CHECK32-NEXT:    csrr a6, vlenb
+; CHECK32-NEXT:    slli a6, a6, 1
+; CHECK32-NEXT:    mv a7, a6
+; CHECK32-NEXT:    slli a6, a6, 3
+; CHECK32-NEXT:    add a6, a6, a7
+; CHECK32-NEXT:    add a6, sp, a6
+; CHECK32-NEXT:    addi a6, a6, 16
+; CHECK32-NEXT:    vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    mv a6, a0
+; CHECK32-NEXT:    slli a0, a0, 2
+; CHECK32-NEXT:    add a0, a0, a6
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK32-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT:    vmfeq.vv v10, v16, v24, v0.t
+; CHECK32-NEXT:    vmv1r.v v9, v7
+; CHECK32-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK32-NEXT:    vslideup.vx v9, v5, a4
+; CHECK32-NEXT:    bltu a2, a5, .LBB85_6
+; CHECK32-NEXT:  # %bb.5:
+; CHECK32-NEXT:    mv a2, a5
+; CHECK32-NEXT:  .LBB85_6:
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    mv a5, a0
+; CHECK32-NEXT:    slli a0, a0, 3
+; CHECK32-NEXT:    add a0, a0, a5
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    mv a2, a0
+; CHECK32-NEXT:    slli a0, a0, 2
+; CHECK32-NEXT:    add a0, a0, a2
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT:    vfwcvtbf16.f.f.v v24, v0
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    mv a2, a0
+; CHECK32-NEXT:    slli a0, a0, 2
+; CHECK32-NEXT:    add a2, a2, a0
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    add a0, a0, a2
+; CHECK32-NEXT:    add a0, sp, a0
+; CHECK32-NEXT:    addi a0, a0, 16
+; CHECK32-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT:    vmfeq.vv v8, v16, v24, v0.t
+; CHECK32-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK32-NEXT:    vslideup.vx v8, v10, a4
+; CHECK32-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK32-NEXT:    vslideup.vx v8, v9, a1
+; CHECK32-NEXT:    vmv.v.v v0, v8
+; CHECK32-NEXT:    csrr a0, vlenb
+; CHECK32-NEXT:    mv a1, a0
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    add a1, a1, a0
+; CHECK32-NEXT:    slli a0, a0, 2
+; CHECK32-NEXT:    add a1, a1, a0
+; CHECK32-NEXT:    slli a0, a0, 1
+; CHECK32-NEXT:    add a0, a0, a1
+; CHECK32-NEXT:    add sp, sp, a0
+; CHECK32-NEXT:    .cfi_def_cfa sp, 16
+; CHECK32-NEXT:    addi sp, sp, 16
+; CHECK32-NEXT:    .cfi_def_cfa_offset 0
+; CHECK32-NEXT:    ret
+;
+; CHECK64-LABEL: fcmp_oeq_vv_nxv64bf16:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    addi sp, sp, -16
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    csrr a1, vlenb
+; CHECK64-NEXT:    mv a3, a1
+; CHECK64-NEXT:    slli a1, a1, 1
+; CHECK64-NEXT:    add a3, a3, a1
+; CHECK64-NEXT:    slli a1, a1, 2
+; CHECK64-NEXT:    add a3, a3, a1
+; CHECK64-NEXT:    slli a1, a1, 1
+; CHECK64-NEXT:    add a1, a1, a3
+; CHECK64-NEXT:    sub sp, sp, a1
+; CHECK64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
+; CHECK64-NEXT:    csrr a1, vlenb
+; CHECK64-NEXT:    slli a1, a1, 1
+; CHECK64-NEXT:    mv a3, a1
+; CHECK64-NEXT:    slli a1, a1, 3
+; CHECK64-NEXT:    add a1, a1, a3
+; CHECK64-NEXT:    add a1, sp, a1
+; CHECK64-NEXT:    addi a1, a1, 16
+; CHECK64-NEXT:    vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK64-NEXT:    vmv8r.v v0, v16
+; CHECK64-NEXT:    csrr a1, vlenb
+; CHECK64-NEXT:    mv a3, a1
+; CHECK64-NEXT:    slli a1, a1, 1
+; CHECK64-NEXT:    add a3, a3, a1
+; CHECK64-NEXT:    slli a1, a1, 3
+; CHECK64-NEXT:    add a1, a1, a3
+; CHECK64-NEXT:    add a1, sp, a1
+; CHECK64-NEXT:    addi a1, a1, 16
+; CHECK64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT:    csrr a3, vlenb
+; CHECK64-NEXT:    slli a1, a3, 3
+; CHECK64-NEXT:    slli a5, a3, 2
+; CHECK64-NEXT:    slli a4, a3, 1
+; CHECK64-NEXT:    add a1, a0, a1
+; CHECK64-NEXT:    sub a6, a2, a5
+; CHECK64-NEXT:    sltu a7, a5, a2
+; CHECK64-NEXT:    vl8re16.v v24, (a1)
+; CHECK64-NEXT:    addi a7, a7, -1
+; CHECK64-NEXT:    and a6, a7, a6
+; CHECK64-NEXT:    sub a1, a6, a4
+; CHECK64-NEXT:    sltu a7, a4, a6
+; CHECK64-NEXT:    addi a7, a7, -1
+; CHECK64-NEXT:    and a7, a7, a1
+; CHECK64-NEXT:    srli a1, a3, 1
+; CHECK64-NEXT:    srli a3, a3, 2
+; CHECK64-NEXT:    csrr t0, vlenb
+; CHECK64-NEXT:    slli t0, t0, 1
+; CHECK64-NEXT:    mv t1, t0
+; CHECK64-NEXT:    slli t0, t0, 3
+; CHECK64-NEXT:    add t0, t0, t1
+; CHECK64-NEXT:    add t0, sp, t0
+; CHECK64-NEXT:    addi t0, t0, 16
+; CHECK64-NEXT:    vl1r.v v8, (t0) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT:    vslidedown.vx v16, v8, a1
+; CHECK64-NEXT:    vl8re16.v v8, (a0)
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    mv t0, a0
+; CHECK64-NEXT:    slli a0, a0, 2
+; CHECK64-NEXT:    add a0, a0, t0
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK64-NEXT:    vslidedown.vx v8, v16, a3
+; CHECK64-NEXT:    addi a0, sp, 16
+; CHECK64-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT:    vsetvli zero, a7, e16, m4, ta, ma
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v8, v4
+; CHECK64-NEXT:    bltu a6, a4, .LBB85_2
+; CHECK64-NEXT:  # %bb.1:
+; CHECK64-NEXT:    mv a6, a4
+; CHECK64-NEXT:  .LBB85_2:
+; CHECK64-NEXT:    addi a0, sp, 16
+; CHECK64-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT:    vmfeq.vv v5, v8, v16, v0.t
+; CHECK64-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT:    vmfeq.vv v6, v24, v16, v0.t
+; CHECK64-NEXT:    bltu a2, a5, .LBB85_4
+; CHECK64-NEXT:  # %bb.3:
+; CHECK64-NEXT:    mv a2, a5
+; CHECK64-NEXT:  .LBB85_4:
+; CHECK64-NEXT:    sub a0, a2, a4
+; CHECK64-NEXT:    sltu a5, a4, a2
+; CHECK64-NEXT:    csrr a6, vlenb
+; CHECK64-NEXT:    slli a6, a6, 1
+; CHECK64-NEXT:    mv a7, a6
+; CHECK64-NEXT:    slli a6, a6, 3
+; CHECK64-NEXT:    add a6, a6, a7
+; CHECK64-NEXT:    add a6, sp, a6
+; CHECK64-NEXT:    addi a6, a6, 16
+; CHECK64-NEXT:    vl1r.v v7, (a6) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT:    vsetvli a6, zero, e8, mf2, ta, ma
+; CHECK64-NEXT:    vslidedown.vx v0, v7, a3
+; CHECK64-NEXT:    addi a5, a5, -1
+; CHECK64-NEXT:    and a0, a5, a0
+; CHECK64-NEXT:    csrr a5, vlenb
+; CHECK64-NEXT:    mv a6, a5
+; CHECK64-NEXT:    slli a5, a5, 1
+; CHECK64-NEXT:    add a6, a6, a5
+; CHECK64-NEXT:    slli a5, a5, 3
+; CHECK64-NEXT:    add a5, a5, a6
+; CHECK64-NEXT:    add a5, sp, a5
+; CHECK64-NEXT:    addi a5, a5, 16
+; CHECK64-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    mv a5, a0
+; CHECK64-NEXT:    slli a0, a0, 2
+; CHECK64-NEXT:    add a0, a0, a5
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK64-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT:    vmfeq.vv v4, v16, v24, v0.t
+; CHECK64-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK64-NEXT:    vslideup.vx v6, v5, a3
+; CHECK64-NEXT:    bltu a2, a4, .LBB85_6
+; CHECK64-NEXT:  # %bb.5:
+; CHECK64-NEXT:    mv a2, a4
+; CHECK64-NEXT:  .LBB85_6:
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    mv a4, a0
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    add a4, a4, a0
+; CHECK64-NEXT:    slli a0, a0, 3
+; CHECK64-NEXT:    add a0, a0, a4
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    mv a2, a0
+; CHECK64-NEXT:    slli a0, a0, 2
+; CHECK64-NEXT:    add a0, a0, a2
+; CHECK64-NEXT:    add a0, sp, a0
+; CHECK64-NEXT:    addi a0, a0, 16
+; CHECK64-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK64-NEXT:    vmv1r.v v0, v7
+; CHECK64-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT:    vmfeq.vv v8, v16, v24, v0.t
+; CHECK64-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK64-NEXT:    vslideup.vx v8, v4, a3
+; CHECK64-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK64-NEXT:    vslideup.vx v8, v6, a1
+; CHECK64-NEXT:    vmv.v.v v0, v8
+; CHECK64-NEXT:    csrr a0, vlenb
+; CHECK64-NEXT:    mv a1, a0
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    add a1, a1, a0
+; CHECK64-NEXT:    slli a0, a0, 2
+; CHECK64-NEXT:    add a1, a1, a0
+; CHECK64-NEXT:    slli a0, a0, 1
+; CHECK64-NEXT:    add a0, a0, a1
+; CHECK64-NEXT:    add sp, sp, a0
+; CHECK64-NEXT:    .cfi_def_cfa sp, 16
+; CHECK64-NEXT:    addi sp, sp, 16
+; CHECK64-NEXT:    .cfi_def_cfa_offset 0
+; CHECK64-NEXT:    ret
   %v = call <vscale x 64 x i1> @llvm.vp.fcmp.nxv64bf16(<vscale x 64 x bfloat> %va, <vscale x 64 x bfloat> %vb, metadata !"oeq", <vscale x 64 x i1> %m, i32 %evl)
   ret <vscale x 64 x i1> %v
 }
@@ -3479,257 +3672,6 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 }
 
 define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscale x 64 x half> %vb, <vscale x 64 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: fcmp_oeq_vv_nxv64f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    addi sp, sp, -16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 16
-; ZVFH-NEXT:    csrr a1, vlenb
-; ZVFH-NEXT:    slli a1, a1, 3
-; ZVFH-NEXT:    sub sp, sp, a1
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFH-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; ZVFH-NEXT:    vmv1r.v v7, v0
-; ZVFH-NEXT:    addi a1, sp, 16
-; ZVFH-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT:    csrr a3, vlenb
-; ZVFH-NEXT:    srli a1, a3, 1
-; ZVFH-NEXT:    slli a4, a3, 3
-; ZVFH-NEXT:    slli a3, a3, 2
-; ZVFH-NEXT:    add a4, a0, a4
-; ZVFH-NEXT:    sub a5, a2, a3
-; ZVFH-NEXT:    vl8re16.v v24, (a4)
-; ZVFH-NEXT:    sltu a4, a2, a5
-; ZVFH-NEXT:    addi a4, a4, -1
-; ZVFH-NEXT:    vl8re16.v v8, (a0)
-; ZVFH-NEXT:    vslidedown.vx v0, v0, a1
-; ZVFH-NEXT:    and a4, a4, a5
-; ZVFH-NEXT:    vsetvli zero, a4, e16, m8, ta, ma
-; ZVFH-NEXT:    vmfeq.vv v6, v16, v24, v0.t
-; ZVFH-NEXT:    bltu a2, a3, .LBB171_2
-; ZVFH-NEXT:  # %bb.1:
-; ZVFH-NEXT:    mv a2, a3
-; ZVFH-NEXT:  .LBB171_2:
-; ZVFH-NEXT:    vmv1r.v v0, v7
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; ZVFH-NEXT:    vmfeq.vv v16, v24, v8, v0.t
-; ZVFH-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; ZVFH-NEXT:    vslideup.vx v16, v6, a1
-; ZVFH-NEXT:    vmv.v.v v0, v16
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
-; ZVFH-NEXT:    add sp, sp, a0
-; ZVFH-NEXT:    .cfi_def_cfa sp, 16
-; ZVFH-NEXT:    addi sp, sp, 16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 0
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv64f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    mv a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a3, a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    add a3, a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    mv a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    add a3, a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    mv a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, a1, a3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a1, a3, 3
-; ZVFHMIN-NEXT:    slli a5, a3, 2
-; ZVFHMIN-NEXT:    slli a4, a3, 1
-; ZVFHMIN-NEXT:    add a1, a0, a1
-; ZVFHMIN-NEXT:    sub a6, a2, a5
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a1)
-; ZVFHMIN-NEXT:    sltu a1, a2, a6
-; ZVFHMIN-NEXT:    addi a1, a1, -1
-; ZVFHMIN-NEXT:    and a6, a1, a6
-; ZVFHMIN-NEXT:    sub a1, a6, a4
-; ZVFHMIN-NEXT:    sltu a7, a6, a1
-; ZVFHMIN-NEXT:    addi a7, a7, -1
-; ZVFHMIN-NEXT:    and a7, a7, a1
-; ZVFHMIN-NEXT:    srli a1, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    csrr t0, vlenb
-; ZVFHMIN-NEXT:    slli t0, t0, 1
-; ZVFHMIN-NEXT:    mv t1, t0
-; ZVFHMIN-NEXT:    slli t0, t0, 2
-; ZVFHMIN-NEXT:    add t1, t1, t0
-; ZVFHMIN-NEXT:    slli t0, t0, 1
-; ZVFHMIN-NEXT:    add t0, t0, t1
-; ZVFHMIN-NEXT:    add t0, sp, t0
-; ZVFHMIN-NEXT:    addi t0, t0, 16
-; ZVFHMIN-NEXT:    vl1r.v v8, (t0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v8, a1
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    mv t0, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a0, a0, t0
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v8, v16, a3
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a7, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
-; ZVFHMIN-NEXT:    bltu a6, a4, .LBB171_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a6, a4
-; ZVFHMIN-NEXT:  .LBB171_2:
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v5, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a2, a5, .LBB171_4
-; ZVFHMIN-NEXT:  # %bb.3:
-; ZVFHMIN-NEXT:    mv a2, a5
-; ZVFHMIN-NEXT:  .LBB171_4:
-; ZVFHMIN-NEXT:    sub a0, a2, a4
-; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    slli a5, a5, 1
-; ZVFHMIN-NEXT:    mv a6, a5
-; ZVFHMIN-NEXT:    slli a5, a5, 2
-; ZVFHMIN-NEXT:    add a6, a6, a5
-; ZVFHMIN-NEXT:    slli a5, a5, 1
-; ZVFHMIN-NEXT:    add a5, a5, a6
-; ZVFHMIN-NEXT:    add a5, sp, a5
-; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl1r.v v8, (a5) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a5, a2, a0
-; ZVFHMIN-NEXT:    addi a5, a5, -1
-; ZVFHMIN-NEXT:    and a0, a5, a0
-; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    slli a5, a5, 1
-; ZVFHMIN-NEXT:    mv a6, a5
-; ZVFHMIN-NEXT:    slli a5, a5, 3
-; ZVFHMIN-NEXT:    add a5, a5, a6
-; ZVFHMIN-NEXT:    add a5, sp, a5
-; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    mv a5, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a0, a0, a5
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v10, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v9, v7
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v9, v5, a3
-; ZVFHMIN-NEXT:    bltu a2, a4, .LBB171_6
-; ZVFHMIN-NEXT:  # %bb.5:
-; ZVFHMIN-NEXT:    mv a2, a4
-; ZVFHMIN-NEXT:  .LBB171_6:
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    mv a4, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, a0, a4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    mv a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a0, a0, a2
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    mv a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a2, a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a2
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v8, v10, a3
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v8, v9, a1
-; ZVFHMIN-NEXT:    vmv.v.v v0, v8
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a1, a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a1, a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 64 x i1> @llvm.vp.fcmp.nxv64f16(<vscale x 64 x half> %va, <vscale x 64 x half> %vb, metadata !"oeq", <vscale x 64 x i1> %m, i32 %evl)
   ret <vscale x 64 x i1> %v
 }
@@ -4879,7 +4821,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK32-NEXT:    add a4, s3, a6
 ; CHECK32-NEXT:    vl8re64.v v24, (s3)
 ; CHECK32-NEXT:    sub a6, a3, s0
-; CHECK32-NEXT:    sltu a7, a3, a6
+; CHECK32-NEXT:    sltu a7, s0, a3
 ; CHECK32-NEXT:    addi a7, a7, -1
 ; CHECK32-NEXT:    and a6, a7, a6
 ; CHECK32-NEXT:    csrr a7, vlenb
@@ -4919,7 +4861,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK32-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; CHECK32-NEXT:    vl8re64.v v16, (a4)
 ; CHECK32-NEXT:    sub a1, s1, a2
-; CHECK32-NEXT:    sltu a2, s1, a1
+; CHECK32-NEXT:    sltu a2, a2, s1
 ; CHECK32-NEXT:    vl8re64.v v24, (s2)
 ; CHECK32-NEXT:    addi a2, a2, -1
 ; CHECK32-NEXT:    and s1, a2, a1
@@ -4964,7 +4906,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK32-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
 ; CHECK32-NEXT:    vslideup.vx v9, v8, s4
 ; CHECK32-NEXT:    sub a1, s1, s0
-; CHECK32-NEXT:    sltu a2, s1, a1
+; CHECK32-NEXT:    sltu a2, s0, s1
 ; CHECK32-NEXT:    addi a2, a2, -1
 ; CHECK32-NEXT:    and a1, a2, a1
 ; CHECK32-NEXT:    csrr a2, vlenb
@@ -4979,7 +4921,8 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK32-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
 ; CHECK32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK32-NEXT:    vmfeq.vv v8, v24, v16, v0.t
-; CHECK32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK32-NEXT:    srli s0, s0, 1
+; CHECK32-NEXT:    vsetvli zero, s0, e8, mf2, ta, ma
 ; CHECK32-NEXT:    vslideup.vx v9, v8, a0
 ; CHECK32-NEXT:    vmv1r.v v0, v9
 ; CHECK32-NEXT:    csrr a0, vlenb
@@ -5090,7 +5033,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK64-NEXT:    add a4, s3, a6
 ; CHECK64-NEXT:    vl8re64.v v24, (s3)
 ; CHECK64-NEXT:    sub a6, a3, s0
-; CHECK64-NEXT:    sltu a7, a3, a6
+; CHECK64-NEXT:    sltu a7, s0, a3
 ; CHECK64-NEXT:    addi a7, a7, -1
 ; CHECK64-NEXT:    and a6, a7, a6
 ; CHECK64-NEXT:    csrr a7, vlenb
@@ -5130,7 +5073,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK64-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; CHECK64-NEXT:    vl8re64.v v16, (a4)
 ; CHECK64-NEXT:    sub a1, s1, a2
-; CHECK64-NEXT:    sltu a2, s1, a1
+; CHECK64-NEXT:    sltu a2, a2, s1
 ; CHECK64-NEXT:    vl8re64.v v24, (s2)
 ; CHECK64-NEXT:    addi a2, a2, -1
 ; CHECK64-NEXT:    and s1, a2, a1
@@ -5175,7 +5118,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK64-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
 ; CHECK64-NEXT:    vslideup.vx v9, v8, s4
 ; CHECK64-NEXT:    sub a1, s1, s0
-; CHECK64-NEXT:    sltu a2, s1, a1
+; CHECK64-NEXT:    sltu a2, s0, s1
 ; CHECK64-NEXT:    addi a2, a2, -1
 ; CHECK64-NEXT:    and a1, a2, a1
 ; CHECK64-NEXT:    csrr a2, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index c1de57bf850ac..829a3b43bd984 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -1083,7 +1083,7 @@ define <vscale x 128 x i1> @icmp_eq_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    sub a4, a3, a1
 ; CHECK-NEXT:    vl8r.v v24, (a2)
-; CHECK-NEXT:    sltu a2, a3, a4
+; CHECK-NEXT:    sltu a2, a1, a3
 ; CHECK-NEXT:    vl8r.v v8, (a0)
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a4
@@ -1120,7 +1120,7 @@ define <vscale x 128 x i1> @icmp_eq_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -1150,7 +1150,7 @@ define <vscale x 128 x i1> @icmp_eq_vx_swap_nxv128i8(<vscale x 128 x i8> %va, i8
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -2195,81 +2195,155 @@ define <vscale x 8 x i1> @icmp_sle_vi_swap_nxv8i32(<vscale x 8 x i32> %va, <vsca
 }
 
 define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: icmp_eq_vv_nxv32i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    srli a1, a3, 2
-; CHECK-NEXT:    slli a4, a3, 3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a4, a0, a4
-; CHECK-NEXT:    sub a5, a2, a3
-; CHECK-NEXT:    vl8re32.v v24, (a4)
-; CHECK-NEXT:    sltu a4, a2, a5
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    vl8re32.v v8, (a0)
-; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    and a4, a4, a5
-; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v6, v16, v24, v0.t
-; CHECK-NEXT:    bltu a2, a3, .LBB189_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a2, a3
-; CHECK-NEXT:  .LBB189_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v16, v24, v8, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v16, v6, a1
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
+; RV32-LABEL: icmp_eq_vv_nxv32i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vmv1r.v v7, v0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    srli a1, a3, 2
+; RV32-NEXT:    slli a5, a3, 3
+; RV32-NEXT:    slli a4, a3, 1
+; RV32-NEXT:    add a5, a0, a5
+; RV32-NEXT:    sub a6, a2, a4
+; RV32-NEXT:    vl8re32.v v24, (a5)
+; RV32-NEXT:    sltu a5, a4, a2
+; RV32-NEXT:    addi a5, a5, -1
+; RV32-NEXT:    vl8re32.v v8, (a0)
+; RV32-NEXT:    vslidedown.vx v0, v0, a1
+; RV32-NEXT:    and a0, a5, a6
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT:    vmseq.vv v6, v16, v24, v0.t
+; RV32-NEXT:    bltu a2, a4, .LBB189_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a4
+; RV32-NEXT:  .LBB189_2:
+; RV32-NEXT:    vmv1r.v v0, v7
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT:    vmseq.vv v16, v24, v8, v0.t
+; RV32-NEXT:    srli a3, a3, 1
+; RV32-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; RV32-NEXT:    vslideup.vx v16, v6, a1
+; RV32-NEXT:    vmv1r.v v0, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    .cfi_def_cfa sp, 16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    .cfi_def_cfa_offset 0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: icmp_eq_vv_nxv32i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vmv1r.v v7, v0
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    srli a1, a3, 2
+; RV64-NEXT:    slli a4, a3, 3
+; RV64-NEXT:    slli a3, a3, 1
+; RV64-NEXT:    add a4, a0, a4
+; RV64-NEXT:    sub a5, a2, a3
+; RV64-NEXT:    vl8re32.v v24, (a4)
+; RV64-NEXT:    sltu a4, a3, a2
+; RV64-NEXT:    addi a4, a4, -1
+; RV64-NEXT:    vl8re32.v v8, (a0)
+; RV64-NEXT:    vslidedown.vx v0, v0, a1
+; RV64-NEXT:    and a4, a4, a5
+; RV64-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
+; RV64-NEXT:    vmseq.vv v6, v16, v24, v0.t
+; RV64-NEXT:    bltu a2, a3, .LBB189_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB189_2:
+; RV64-NEXT:    vmv1r.v v0, v7
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; RV64-NEXT:    vmseq.vv v16, v24, v8, v0.t
+; RV64-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vslideup.vx v16, v6, a1
+; RV64-NEXT:    vmv1r.v v0, v16
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    ret
   %v = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, metadata !"eq", <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i1> %v
 }
 
 define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: icmp_eq_vx_nxv32i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    srli a2, a3, 2
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a4, a1, a3
-; CHECK-NEXT:    sltu a5, a1, a4
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a4, a5, a4
-; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
-; CHECK-NEXT:    bltu a1, a3, .LBB190_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a1, a3
-; CHECK-NEXT:  .LBB190_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v16, v25, a2
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    ret
+; RV32-LABEL: icmp_eq_vx_nxv32i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vmv1r.v v24, v0
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    srli a2, a3, 2
+; RV32-NEXT:    slli a4, a3, 1
+; RV32-NEXT:    vslidedown.vx v0, v0, a2
+; RV32-NEXT:    sub a5, a1, a4
+; RV32-NEXT:    sltu a6, a4, a1
+; RV32-NEXT:    addi a6, a6, -1
+; RV32-NEXT:    and a5, a6, a5
+; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT:    vmseq.vx v25, v16, a0, v0.t
+; RV32-NEXT:    bltu a1, a4, .LBB190_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a1, a4
+; RV32-NEXT:  .LBB190_2:
+; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmseq.vx v16, v8, a0, v0.t
+; RV32-NEXT:    srli a3, a3, 1
+; RV32-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; RV32-NEXT:    vslideup.vx v16, v25, a2
+; RV32-NEXT:    vmv1r.v v0, v16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: icmp_eq_vx_nxv32i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vmv1r.v v24, v0
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    srli a2, a3, 2
+; RV64-NEXT:    slli a3, a3, 1
+; RV64-NEXT:    vslidedown.vx v0, v0, a2
+; RV64-NEXT:    sub a4, a1, a3
+; RV64-NEXT:    sltu a5, a3, a1
+; RV64-NEXT:    addi a5, a5, -1
+; RV64-NEXT:    and a4, a5, a4
+; RV64-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
+; RV64-NEXT:    vmseq.vx v25, v16, a0, v0.t
+; RV64-NEXT:    bltu a1, a3, .LBB190_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a1, a3
+; RV64-NEXT:  .LBB190_2:
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT:    vmseq.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vslideup.vx v16, v25, a2
+; RV64-NEXT:    vmv1r.v v0, v16
+; RV64-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
   %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, metadata !"eq", <vscale x 32 x i1> %m, i32 %evl)
@@ -2277,31 +2351,58 @@ define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b,
 }
 
 define <vscale x 32 x i1> @icmp_eq_vx_swap_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: icmp_eq_vx_swap_nxv32i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    srli a2, a3, 2
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a4, a1, a3
-; CHECK-NEXT:    sltu a5, a1, a4
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a4, a5, a4
-; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v25, v16, a0, v0.t
-; CHECK-NEXT:    bltu a1, a3, .LBB191_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a1, a3
-; CHECK-NEXT:  .LBB191_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v16, v25, a2
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    ret
+; RV32-LABEL: icmp_eq_vx_swap_nxv32i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT:    vmv1r.v v24, v0
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    srli a2, a3, 2
+; RV32-NEXT:    slli a4, a3, 1
+; RV32-NEXT:    vslidedown.vx v0, v0, a2
+; RV32-NEXT:    sub a5, a1, a4
+; RV32-NEXT:    sltu a6, a4, a1
+; RV32-NEXT:    addi a6, a6, -1
+; RV32-NEXT:    and a5, a6, a5
+; RV32-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT:    vmseq.vx v25, v16, a0, v0.t
+; RV32-NEXT:    bltu a1, a4, .LBB191_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a1, a4
+; RV32-NEXT:  .LBB191_2:
+; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT:    vmseq.vx v16, v8, a0, v0.t
+; RV32-NEXT:    srli a3, a3, 1
+; RV32-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; RV32-NEXT:    vslideup.vx v16, v25, a2
+; RV32-NEXT:    vmv1r.v v0, v16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: icmp_eq_vx_swap_nxv32i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vmv1r.v v24, v0
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    srli a2, a3, 2
+; RV64-NEXT:    slli a3, a3, 1
+; RV64-NEXT:    vslidedown.vx v0, v0, a2
+; RV64-NEXT:    sub a4, a1, a3
+; RV64-NEXT:    sltu a5, a3, a1
+; RV64-NEXT:    addi a5, a5, -1
+; RV64-NEXT:    and a4, a5, a4
+; RV64-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
+; RV64-NEXT:    vmseq.vx v25, v16, a0, v0.t
+; RV64-NEXT:    bltu a1, a3, .LBB191_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a1, a3
+; RV64-NEXT:  .LBB191_2:
+; RV64-NEXT:    vmv1r.v v0, v24
+; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT:    vmseq.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vslideup.vx v16, v25, a2
+; RV64-NEXT:    vmv1r.v v0, v16
+; RV64-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
   %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> %vb, <vscale x 32 x i32> %va, metadata !"eq", <vscale x 32 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
index 6381887a1a2f9..3d34a619ce8bf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
@@ -595,7 +595,7 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV32-NEXT:    vmv1r.v v9, v0
 ; CHECK-RV32-NEXT:    csrr a4, vlenb
 ; CHECK-RV32-NEXT:    sub a2, a3, a4
-; CHECK-RV32-NEXT:    sltu a5, a3, a2
+; CHECK-RV32-NEXT:    sltu a5, a4, a3
 ; CHECK-RV32-NEXT:    addi a5, a5, -1
 ; CHECK-RV32-NEXT:    and a2, a5, a2
 ; CHECK-RV32-NEXT:    bltu a3, a4, .LBB55_2
@@ -621,7 +621,7 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV64-NEXT:    vmv1r.v v9, v0
 ; CHECK-RV64-NEXT:    csrr a4, vlenb
 ; CHECK-RV64-NEXT:    sub a3, a2, a4
-; CHECK-RV64-NEXT:    sltu a5, a2, a3
+; CHECK-RV64-NEXT:    sltu a5, a4, a2
 ; CHECK-RV64-NEXT:    addi a5, a5, -1
 ; CHECK-RV64-NEXT:    and a3, a5, a3
 ; CHECK-RV64-NEXT:    bltu a2, a4, .LBB55_2
@@ -647,19 +647,19 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
 define <vscale x 16 x double> @strided_load_nxv16f64_allones_mask(ptr %ptr, i64 %stride, i32 zeroext %evl) {
 ; CHECK-RV32-LABEL: strided_load_nxv16f64_allones_mask:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    csrr a4, vlenb
-; CHECK-RV32-NEXT:    sub a2, a3, a4
-; CHECK-RV32-NEXT:    sltu a5, a3, a2
+; CHECK-RV32-NEXT:    csrr a2, vlenb
+; CHECK-RV32-NEXT:    sub a4, a3, a2
+; CHECK-RV32-NEXT:    sltu a5, a2, a3
 ; CHECK-RV32-NEXT:    addi a5, a5, -1
-; CHECK-RV32-NEXT:    and a2, a5, a2
-; CHECK-RV32-NEXT:    bltu a3, a4, .LBB56_2
+; CHECK-RV32-NEXT:    and a4, a5, a4
+; CHECK-RV32-NEXT:    bltu a3, a2, .LBB56_2
 ; CHECK-RV32-NEXT:  # %bb.1:
-; CHECK-RV32-NEXT:    mv a3, a4
+; CHECK-RV32-NEXT:    mv a3, a2
 ; CHECK-RV32-NEXT:  .LBB56_2:
-; CHECK-RV32-NEXT:    mul a4, a3, a1
-; CHECK-RV32-NEXT:    add a4, a0, a4
-; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v16, (a4), a1
+; CHECK-RV32-NEXT:    mul a2, a3, a1
+; CHECK-RV32-NEXT:    add a2, a0, a2
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v16, (a2), a1
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v8, (a0), a1
 ; CHECK-RV32-NEXT:    ret
@@ -668,7 +668,7 @@ define <vscale x 16 x double> @strided_load_nxv16f64_allones_mask(ptr %ptr, i64
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    csrr a4, vlenb
 ; CHECK-RV64-NEXT:    sub a3, a2, a4
-; CHECK-RV64-NEXT:    sltu a5, a2, a3
+; CHECK-RV64-NEXT:    sltu a5, a4, a2
 ; CHECK-RV64-NEXT:    addi a5, a5, -1
 ; CHECK-RV64-NEXT:    and a3, a5, a3
 ; CHECK-RV64-NEXT:    bltu a2, a4, .LBB56_2
@@ -703,7 +703,7 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV32-NEXT:    mv a6, a7
 ; CHECK-RV32-NEXT:  .LBB57_2:
 ; CHECK-RV32-NEXT:    sub a5, a6, a2
-; CHECK-RV32-NEXT:    sltu t0, a6, a5
+; CHECK-RV32-NEXT:    sltu t0, a2, a6
 ; CHECK-RV32-NEXT:    addi t0, t0, -1
 ; CHECK-RV32-NEXT:    and t0, t0, a5
 ; CHECK-RV32-NEXT:    mv a5, a6
@@ -713,15 +713,15 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV32-NEXT:  .LBB57_4:
 ; CHECK-RV32-NEXT:    mul t1, a5, a1
 ; CHECK-RV32-NEXT:    srli t2, a2, 3
-; CHECK-RV32-NEXT:    sub a7, a3, a7
 ; CHECK-RV32-NEXT:    vsetvli t3, zero, e8, mf4, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vx v0, v8, t2
+; CHECK-RV32-NEXT:    sub t2, a3, a7
 ; CHECK-RV32-NEXT:    add t1, a0, t1
 ; CHECK-RV32-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v16, (t1), a1, v0.t
-; CHECK-RV32-NEXT:    sltu a3, a3, a7
+; CHECK-RV32-NEXT:    sltu a3, a7, a3
 ; CHECK-RV32-NEXT:    addi a3, a3, -1
-; CHECK-RV32-NEXT:    and a3, a3, a7
+; CHECK-RV32-NEXT:    and a3, a3, t2
 ; CHECK-RV32-NEXT:    bltu a3, a2, .LBB57_6
 ; CHECK-RV32-NEXT:  # %bb.5:
 ; CHECK-RV32-NEXT:    mv a3, a2
@@ -751,7 +751,7 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV64-NEXT:    mv a6, a7
 ; CHECK-RV64-NEXT:  .LBB57_2:
 ; CHECK-RV64-NEXT:    sub a5, a6, a4
-; CHECK-RV64-NEXT:    sltu t0, a6, a5
+; CHECK-RV64-NEXT:    sltu t0, a4, a6
 ; CHECK-RV64-NEXT:    addi t0, t0, -1
 ; CHECK-RV64-NEXT:    and t0, t0, a5
 ; CHECK-RV64-NEXT:    mv a5, a6
@@ -761,15 +761,15 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV64-NEXT:  .LBB57_4:
 ; CHECK-RV64-NEXT:    mul t1, a5, a1
 ; CHECK-RV64-NEXT:    srli t2, a4, 3
-; CHECK-RV64-NEXT:    sub a7, a2, a7
 ; CHECK-RV64-NEXT:    vsetvli t3, zero, e8, mf4, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vx v0, v8, t2
+; CHECK-RV64-NEXT:    sub t2, a2, a7
 ; CHECK-RV64-NEXT:    add t1, a0, t1
 ; CHECK-RV64-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vlse64.v v16, (t1), a1, v0.t
-; CHECK-RV64-NEXT:    sltu a2, a2, a7
+; CHECK-RV64-NEXT:    sltu a2, a7, a2
 ; CHECK-RV64-NEXT:    addi a2, a2, -1
-; CHECK-RV64-NEXT:    and a2, a2, a7
+; CHECK-RV64-NEXT:    and a2, a2, t2
 ; CHECK-RV64-NEXT:    bltu a2, a4, .LBB57_6
 ; CHECK-RV64-NEXT:  # %bb.5:
 ; CHECK-RV64-NEXT:    mv a2, a4
@@ -861,10 +861,10 @@ define <vscale x 16 x i64> @zero_strided_vadd_nxv16i64(<vscale x 16 x i64> %v, p
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
 ; CHECK-RV32-NEXT:    srli a2, a1, 3
-; CHECK-RV32-NEXT:    sub a1, a2, a1
-; CHECK-RV32-NEXT:    sltu a3, a2, a1
-; CHECK-RV32-NEXT:    addi a3, a3, -1
-; CHECK-RV32-NEXT:    and a1, a3, a1
+; CHECK-RV32-NEXT:    sub a3, a2, a1
+; CHECK-RV32-NEXT:    sltu a1, a1, a2
+; CHECK-RV32-NEXT:    addi a1, a1, -1
+; CHECK-RV32-NEXT:    and a1, a1, a3
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v24, (a0), zero
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
index 2ec89888af077..12ff5e98c00e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
@@ -492,12 +492,12 @@ define void @strided_store_nxv16f64(<vscale x 16 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v8, (a0), a1, v0.t
 ; CHECK-NEXT:    sub a5, a2, a3
+; CHECK-NEXT:    sltu a2, a3, a2
 ; CHECK-NEXT:    mul a4, a4, a1
 ; CHECK-NEXT:    srli a3, a3, 3
-; CHECK-NEXT:    sltu a2, a2, a5
+; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a5
 ; CHECK-NEXT:    add a0, a0, a4
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -508,25 +508,45 @@ define void @strided_store_nxv16f64(<vscale x 16 x double> %v, ptr %ptr, i32 sig
 }
 
 define void @strided_store_nxv16f64_allones_mask(<vscale x 16 x double> %v, ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
-; CHECK-LABEL: strided_store_nxv16f64_allones_mask:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    bltu a2, a4, .LBB47_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a3, a4
-; CHECK-NEXT:  .LBB47_2:
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT:    vsse64.v v8, (a0), a1
-; CHECK-NEXT:    sub a4, a2, a4
-; CHECK-NEXT:    mul a3, a3, a1
-; CHECK-NEXT:    sltu a2, a2, a4
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a4
-; CHECK-NEXT:    add a0, a0, a3
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vsse64.v v16, (a0), a1
-; CHECK-NEXT:    ret
+; CHECK-RV32-LABEL: strided_store_nxv16f64_allones_mask:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    csrr a3, vlenb
+; CHECK-RV32-NEXT:    mv a4, a2
+; CHECK-RV32-NEXT:    bltu a2, a3, .LBB47_2
+; CHECK-RV32-NEXT:  # %bb.1:
+; CHECK-RV32-NEXT:    mv a4, a3
+; CHECK-RV32-NEXT:  .LBB47_2:
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-RV32-NEXT:    sub a5, a2, a3
+; CHECK-RV32-NEXT:    sltu a2, a3, a2
+; CHECK-RV32-NEXT:    mul a3, a4, a1
+; CHECK-RV32-NEXT:    addi a2, a2, -1
+; CHECK-RV32-NEXT:    and a2, a2, a5
+; CHECK-RV32-NEXT:    add a0, a0, a3
+; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vsse64.v v16, (a0), a1
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: strided_store_nxv16f64_allones_mask:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    csrr a4, vlenb
+; CHECK-RV64-NEXT:    mv a3, a2
+; CHECK-RV64-NEXT:    bltu a2, a4, .LBB47_2
+; CHECK-RV64-NEXT:  # %bb.1:
+; CHECK-RV64-NEXT:    mv a3, a4
+; CHECK-RV64-NEXT:  .LBB47_2:
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-RV64-NEXT:    vsse64.v v8, (a0), a1
+; CHECK-RV64-NEXT:    sub a5, a2, a4
+; CHECK-RV64-NEXT:    sltu a2, a4, a2
+; CHECK-RV64-NEXT:    mul a3, a3, a1
+; CHECK-RV64-NEXT:    addi a2, a2, -1
+; CHECK-RV64-NEXT:    and a2, a2, a5
+; CHECK-RV64-NEXT:    add a0, a0, a3
+; CHECK-RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV64-NEXT:    vsse64.v v16, (a0), a1
+; CHECK-RV64-NEXT:    ret
   call void @llvm.experimental.vp.strided.store.nxv16f64.p0.i32(<vscale x 16 x double> %v, ptr %ptr, i32 %stride, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret void
 }
@@ -554,19 +574,19 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v8, (a1), a2, v0.t
 ; CHECK-NEXT:    sub a0, a5, a4
-; CHECK-NEXT:    mul a7, a7, a2
-; CHECK-NEXT:    srli t0, a4, 3
-; CHECK-NEXT:    sub a6, a3, a6
+; CHECK-NEXT:    sub t0, a3, a6
+; CHECK-NEXT:    sltu a3, a6, a3
+; CHECK-NEXT:    srli a6, a4, 3
 ; CHECK-NEXT:    vsetvli t1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v7, t0
-; CHECK-NEXT:    sltu t0, a5, a0
+; CHECK-NEXT:    vslidedown.vx v0, v7, a6
+; CHECK-NEXT:    sltu a6, a4, a5
+; CHECK-NEXT:    mul a7, a7, a2
+; CHECK-NEXT:    addi a6, a6, -1
 ; CHECK-NEXT:    add a7, a1, a7
-; CHECK-NEXT:    sltu a3, a3, a6
-; CHECK-NEXT:    addi t0, t0, -1
 ; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and t0, t0, a0
-; CHECK-NEXT:    and a0, a3, a6
-; CHECK-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
+; CHECK-NEXT:    and a6, a6, a0
+; CHECK-NEXT:    and a0, a3, t0
+; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v16, (a7), a2, v0.t
 ; CHECK-NEXT:    bltu a0, a4, .LBB48_6
 ; CHECK-NEXT:  # %bb.5:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
index c64b755051898..6378135654ed1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
@@ -551,7 +551,7 @@ define <vscale x 128 x i8> @vadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub a2, a1, a0
-; CHECK-NEXT:    sltu a3, a1, a2
+; CHECK-NEXT:    sltu a3, a0, a1
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
@@ -574,7 +574,7 @@ define <vscale x 128 x i8> @vadd_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va, i
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
@@ -1306,7 +1306,7 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -1329,7 +1329,7 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va, i
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -1354,11 +1354,11 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, <v
 ; RV32-NEXT:    srli a1, a0, 2
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV32-NEXT:    vadd.vi v8, v8, -1, v0.t
-; RV32-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT:    slli a2, a0, 1
+; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a1
-; RV32-NEXT:    slli a1, a0, 1
-; RV32-NEXT:    sub a1, a0, a1
-; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    sub a1, a0, a2
+; RV32-NEXT:    sltu a0, a2, a0
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a1
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -1374,7 +1374,7 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, <v
 ; RV64-NEXT:    slli a1, a0, 1
 ; RV64-NEXT:    vslidedown.vx v0, v0, a2
 ; RV64-NEXT:    sub a2, a0, a1
-; RV64-NEXT:    sltu a3, a0, a2
+; RV64-NEXT:    sltu a3, a1, a0
 ; RV64-NEXT:    addi a3, a3, -1
 ; RV64-NEXT:    and a2, a3, a2
 ; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
index e0fcd4009ad2e..7d97f353a22b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
@@ -847,7 +847,7 @@ define <vscale x 16 x double> @vfabs_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -869,7 +869,7 @@ define <vscale x 16 x double> @vfabs_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index d03b068e11ea8..42b1da9d97f2c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -928,13 +928,13 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; ZVFH-NEXT:    slli a1, a2, 1
 ; ZVFH-NEXT:    srli a2, a2, 2
 ; ZVFH-NEXT:    sub a3, a0, a1
+; ZVFH-NEXT:    sltu a4, a1, a0
 ; ZVFH-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFH-NEXT:    sltu a2, a0, a3
-; ZVFH-NEXT:    addi a2, a2, -1
-; ZVFH-NEXT:    and a2, a2, a3
-; ZVFH-NEXT:    addi a3, sp, 16
-; ZVFH-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT:    addi a4, a4, -1
+; ZVFH-NEXT:    and a3, a4, a3
+; ZVFH-NEXT:    addi a2, sp, 16
+; ZVFH-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -977,13 +977,13 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1026,13 +1026,13 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
+; ZVFBFA-NEXT:    sltu a4, a1, a0
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    addi a3, sp, 16
-; ZVFBFA-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    addi a2, sp, 16
+; ZVFBFA-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16alt, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1079,14 +1079,14 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; ZVFH-NEXT:    slli a1, a2, 1
 ; ZVFH-NEXT:    srli a2, a2, 2
 ; ZVFH-NEXT:    sub a3, a0, a1
-; ZVFH-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFH-NEXT:    sltu a4, a1, a0
+; ZVFH-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFH-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFH-NEXT:    sltu a2, a0, a3
-; ZVFH-NEXT:    addi a2, a2, -1
-; ZVFH-NEXT:    and a2, a2, a3
-; ZVFH-NEXT:    addi a3, sp, 16
-; ZVFH-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT:    addi a4, a4, -1
+; ZVFH-NEXT:    and a3, a4, a3
+; ZVFH-NEXT:    addi a2, sp, 16
+; ZVFH-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1128,14 +1128,14 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1177,14 +1177,14 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
-; ZVFBFA-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT:    sltu a4, a1, a0
+; ZVFBFA-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    addi a3, sp, 16
-; ZVFBFA-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    addi a2, sp, 16
+; ZVFBFA-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16alt, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1216,130 +1216,6 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 }
 
 define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfadd_vf_nxv32bf16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    addi sp, sp, -16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 16
-; ZVFH-NEXT:    csrr a1, vlenb
-; ZVFH-NEXT:    slli a1, a1, 4
-; ZVFH-NEXT:    sub sp, sp, a1
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFH-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFH-NEXT:    vmv1r.v v7, v0
-; ZVFH-NEXT:    fmv.x.h a1, fa0
-; ZVFH-NEXT:    csrr a2, vlenb
-; ZVFH-NEXT:    vmv.v.x v24, a1
-; ZVFH-NEXT:    slli a1, a2, 1
-; ZVFH-NEXT:    srli a2, a2, 2
-; ZVFH-NEXT:    sub a3, a0, a1
-; ZVFH-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFH-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFH-NEXT:    sltu a2, a0, a3
-; ZVFH-NEXT:    addi a2, a2, -1
-; ZVFH-NEXT:    and a2, a2, a3
-; ZVFH-NEXT:    csrr a3, vlenb
-; ZVFH-NEXT:    slli a3, a3, 3
-; ZVFH-NEXT:    add a3, sp, a3
-; ZVFH-NEXT:    addi a3, a3, 16
-; ZVFH-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v28, v0.t
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT:    vfadd.vv v16, v24, v16, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFH-NEXT:    bltu a0, a1, .LBB24_2
-; ZVFH-NEXT:  # %bb.1:
-; ZVFH-NEXT:    mv a0, a1
-; ZVFH-NEXT:  .LBB24_2:
-; ZVFH-NEXT:    vmv1r.v v0, v7
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
-; ZVFH-NEXT:    add a0, sp, a0
-; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT:    vfadd.vv v16, v16, v24, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 4
-; ZVFH-NEXT:    add sp, sp, a0
-; ZVFH-NEXT:    .cfi_def_cfa sp, 16
-; ZVFH-NEXT:    addi sp, sp, 16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 0
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB24_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB24_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-;
 ; ZVFBFA-LABEL: vfadd_vf_nxv32bf16:
 ; ZVFBFA:       # %bb.0:
 ; ZVFBFA-NEXT:    addi sp, sp, -16
@@ -1355,14 +1231,14 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
-; ZVFBFA-NEXT:    vsetvli a4, zero, e8alt, mf2, ta, ma
+; ZVFBFA-NEXT:    sltu a4, a1, a0
+; ZVFBFA-NEXT:    vsetvli a5, zero, e8alt, mf2, ta, ma
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    addi a3, sp, 16
-; ZVFBFA-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    addi a2, sp, 16
+; ZVFBFA-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16alt, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1397,108 +1273,6 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 }
 
 define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; ZVFH-LABEL: vfadd_vf_nxv32bf16_unmasked:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    addi sp, sp, -16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 16
-; ZVFH-NEXT:    csrr a1, vlenb
-; ZVFH-NEXT:    slli a1, a1, 3
-; ZVFH-NEXT:    sub sp, sp, a1
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFH-NEXT:    fmv.x.h a1, fa0
-; ZVFH-NEXT:    csrr a2, vlenb
-; ZVFH-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFH-NEXT:    vmset.m v24
-; ZVFH-NEXT:    vmv.v.x v16, a1
-; ZVFH-NEXT:    slli a1, a2, 1
-; ZVFH-NEXT:    srli a2, a2, 2
-; ZVFH-NEXT:    sub a3, a0, a1
-; ZVFH-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFH-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFH-NEXT:    sltu a2, a0, a3
-; ZVFH-NEXT:    addi a2, a2, -1
-; ZVFH-NEXT:    and a2, a2, a3
-; ZVFH-NEXT:    addi a3, sp, 16
-; ZVFH-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT:    vfadd.vv v16, v16, v24, v0.t
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFH-NEXT:    bltu a0, a1, .LBB25_2
-; ZVFH-NEXT:  # %bb.1:
-; ZVFH-NEXT:    mv a0, a1
-; ZVFH-NEXT:  .LBB25_2:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT:    vfadd.vv v16, v16, v24
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvtbf16.f.f.w v8, v16
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
-; ZVFH-NEXT:    add sp, sp, a0
-; ZVFH-NEXT:    .cfi_def_cfa sp, 16
-; ZVFH-NEXT:    addi sp, sp, 16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 0
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB25_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB25_2:
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-;
 ; ZVFBFA-LABEL: vfadd_vf_nxv32bf16_unmasked:
 ; ZVFBFA:       # %bb.0:
 ; ZVFBFA-NEXT:    addi sp, sp, -16
@@ -1514,14 +1288,14 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
-; ZVFBFA-NEXT:    vsetvli a4, zero, e8alt, mf2, ta, ma
+; ZVFBFA-NEXT:    sltu a4, a1, a0
+; ZVFBFA-NEXT:    vsetvli a5, zero, e8alt, mf2, ta, ma
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    addi a3, sp, 16
-; ZVFBFA-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    addi a2, sp, 16
+; ZVFBFA-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16alt, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -2351,13 +2125,13 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -2400,13 +2174,13 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
+; ZVFBFA-NEXT:    sltu a4, a1, a0
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    addi a3, sp, 16
-; ZVFBFA-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    addi a2, sp, 16
+; ZVFBFA-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -2459,14 +2233,14 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -2508,14 +2282,14 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
-; ZVFBFA-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT:    sltu a4, a1, a0
+; ZVFBFA-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    addi a3, sp, 16
-; ZVFBFA-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    addi a2, sp, 16
+; ZVFBFA-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -2553,68 +2327,6 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFH-NEXT:    vfadd.vf v8, v8, fa0, v0.t
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB50_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB50_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-;
 ; ZVFBFA-LABEL: vfadd_vf_nxv32f16:
 ; ZVFBFA:       # %bb.0:
 ; ZVFBFA-NEXT:    addi sp, sp, -16
@@ -2631,17 +2343,17 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
-; ZVFBFA-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT:    sltu a4, a1, a0
+; ZVFBFA-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    csrr a3, vlenb
-; ZVFBFA-NEXT:    slli a3, a3, 3
-; ZVFBFA-NEXT:    add a3, sp, a3
-; ZVFBFA-NEXT:    addi a3, a3, 16
-; ZVFBFA-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    csrr a2, vlenb
+; ZVFBFA-NEXT:    slli a2, a2, 3
+; ZVFBFA-NEXT:    add a2, sp, a2
+; ZVFBFA-NEXT:    addi a2, a2, 16
+; ZVFBFA-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -2689,57 +2401,6 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFH-NEXT:    vfadd.vf v8, v8, fa0
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB51_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB51_2:
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-;
 ; ZVFBFA-LABEL: vfadd_vf_nxv32f16_unmasked:
 ; ZVFBFA:       # %bb.0:
 ; ZVFBFA-NEXT:    addi sp, sp, -16
@@ -2756,14 +2417,14 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFBFA-NEXT:    slli a1, a2, 1
 ; ZVFBFA-NEXT:    srli a2, a2, 2
 ; ZVFBFA-NEXT:    sub a3, a0, a1
-; ZVFBFA-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT:    sltu a4, a1, a0
+; ZVFBFA-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFBFA-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT:    sltu a2, a0, a3
-; ZVFBFA-NEXT:    addi a2, a2, -1
-; ZVFBFA-NEXT:    and a2, a2, a3
-; ZVFBFA-NEXT:    addi a3, sp, 16
-; ZVFBFA-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT:    addi a4, a4, -1
+; ZVFBFA-NEXT:    and a3, a4, a3
+; ZVFBFA-NEXT:    addi a2, sp, 16
+; ZVFBFA-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFBFA-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFBFA-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
index e9d7137919ac9..5f8603067d82a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
@@ -367,13 +367,13 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -420,14 +420,14 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -459,67 +459,6 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 }
 
 define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfdiv_vf_nxv32bf16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vmv.v.x v24, a1
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB22_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB22_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x bfloat> @llvm.vp.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -527,56 +466,6 @@ define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 }
 
 define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfdiv_vf_nxv32bf16_unmasked:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmset.m v24
-; CHECK-NEXT:    vmv.v.x v16, a1
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB23_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB23_2:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v16, v24
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x bfloat> @llvm.vp.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -1064,13 +953,13 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1123,14 +1012,14 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1167,68 +1056,6 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfdiv.vf v8, v8, fa0, v0.t
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB46_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB46_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x half> @llvm.vp.fdiv.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -1241,57 +1068,6 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfdiv.vf v8, v8, fa0
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB47_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB47_2:
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x half> @llvm.vp.fdiv.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
index c25a0d47c5c53..03cbe8c5d555c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
@@ -600,16 +600,16 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 ; CHECK-NEXT:    slli a0, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a1, a0
+; CHECK-NEXT:    sltu a4, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a1, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -716,17 +716,17 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    slli a0, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a1, a0
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a0, a1
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    sltu a2, a1, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -815,7 +815,7 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfl
 ; CHECK-NEXT:    srli a3, a3, 2
 ; CHECK-NEXT:    sub a4, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    sltu a3, a0, a4
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    csrr a4, vlenb
@@ -912,124 +912,6 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfl
 }
 
 define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_commute(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfma_vf_nxv32bf16_commute:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v3, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vmv.v.x v8, a1
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v4, v8, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB33_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB33_2:
-; CHECK-NEXT:    vmv1r.v v0, v3
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; CHECK-NEXT:    vmv4r.v v12, v4
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> %m, i32 %evl)
@@ -1058,7 +940,7 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    sub a4, a0, a1
 ; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a3
-; CHECK-NEXT:    sltu a3, a0, a4
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    csrr a4, vlenb
@@ -1161,107 +1043,6 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 }
 
 define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked_commute(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x bfloat> %vc, i32 zeroext %evl) {
-; CHECK-LABEL: vfma_vf_nxv32bf16_unmasked_commute:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmset.m v8
-; CHECK-NEXT:    vmv.v.x v24, a1
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28, v0.t
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a3, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v20, v8, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB35_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB35_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v0, v24, v8
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v16, v0
-; CHECK-NEXT:    vmv8r.v v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -2049,16 +1830,16 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a0, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a1, a0
+; ZVFHMIN-NEXT:    sltu a4, a0, a1
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a1, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -2172,17 +1953,17 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    slli a0, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a1, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -2277,7 +2058,7 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -2378,153 +2159,34 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_commute(<vscale x 32 x half> %va,
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfmadd.vf v8, fa0, v16, v0.t
+; ZVFH-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfma_vf_nxv32f16_unmasked:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT:    vfmadd.vf v8, fa0, v16
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfma_vf_nxv32f16_commute:
+; ZVFHMIN-LABEL: vfma_vf_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v8, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB69_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB69_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    vmv4r.v v12, v4
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
-  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfma_vf_nxv32f16_unmasked:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfmadd.vf v8, fa0, v16
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfma_vf_nxv32f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v24, v8
-; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vmset.m v8
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
@@ -2532,7 +2194,7 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -2640,108 +2302,6 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked_commute(<vscale x 32 x ha
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfmadd.vf v8, fa0, v16
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfma_vf_nxv32f16_unmasked_commute:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v8
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB71_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB71_2:
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v0
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -3428,14 +2988,14 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    slli a5, a1, 3
 ; CHECK-NEXT:    sub a6, a4, a1
 ; CHECK-NEXT:    add a7, a2, a5
-; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    vl8re64.v v8, (a7)
 ; CHECK-NEXT:    csrr a7, vlenb
 ; CHECK-NEXT:    slli a7, a7, 3
 ; CHECK-NEXT:    add a7, sp, a7
 ; CHECK-NEXT:    addi a7, a7, 16
 ; CHECK-NEXT:    vs8r.v v8, (a7) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    sltu a7, a4, a6
+; CHECK-NEXT:    sltu a7, a1, a4
+; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    addi a7, a7, -1
 ; CHECK-NEXT:    vl8re64.v v8, (a5)
 ; CHECK-NEXT:    csrr a5, vlenb
@@ -3563,7 +3123,7 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    sub a5, a4, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v24, (a3)
-; CHECK-NEXT:    sltu a3, a4, a5
+; CHECK-NEXT:    sltu a3, a1, a4
 ; CHECK-NEXT:    vl8re64.v v8, (a2)
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
@@ -7976,35 +7536,36 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
+; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2, v0.t
+; ZVFHMIN-NEXT:    sltu a3, a0, a1
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a2, v0.t
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v6
+; ZVFHMIN-NEXT:    vmv4r.v v8, v16
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -8013,37 +7574,37 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v24, v0.t
 ; ZVFHMIN-NEXT:    bltu a1, a0, .LBB280_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:  .LBB280_2:
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
@@ -8052,17 +7613,17 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vmv.v.v v16, v8
 ; ZVFHMIN-NEXT:    vmv4r.v v12, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
@@ -8114,10 +7675,10 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    sub a4, a1, a0
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
+; ZVFHMIN-NEXT:    sltu a3, a0, a1
+; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    vmv4r.v v8, v16
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
@@ -8229,7 +7790,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -8338,128 +7899,6 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_commute(<vscale x 32 x half> %va
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfmsub.vf v8, fa0, v16, v0.t
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfmsub_vf_nxv32f16_commute:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT:    slli a1, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v8, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB283_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB283_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    vmv4r.v v12, v4
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
@@ -8498,7 +7937,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -8606,9 +8045,75 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfmsub.vf v8, fa0, v16
+; ZVFH-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v16, v8, v24, v0.t
+; ZVFH-NEXT:    vmv.v.v v8, v16
+; ZVFH-NEXT:    ret
+  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16_commuted:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24, v0.t
+; ZVFH-NEXT:    ret
+  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT:    ret
+  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT:    ret
+  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16, v0.t
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfmsub_vf_nxv32f16_unmasked_commute:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
@@ -8616,30 +8121,31 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv1r.v v3, v0
+; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    mv a4, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, a1, a4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
@@ -8654,7 +8160,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -8662,27 +8168,39 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB285_2
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB290_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB285_2:
+; ZVFHMIN-NEXT:  .LBB290_2:
+; ZVFHMIN-NEXT:    vmv1r.v v0, v3
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a1, a0
@@ -8690,1046 +8208,21 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v0
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
-  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v16, v8, v24, v0.t
-; ZVFH-NEXT:    vmv.v.v v8, v16
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a2, v0.t
-; ZVFHMIN-NEXT:    slli a0, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    slli a5, a5, 4
-; ZVFHMIN-NEXT:    add a5, sp, a5
-; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2, v0.t
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v24, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB286_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB286_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vmv4r.v v12, v4
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
-  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16_commuted:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24, v0.t
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_commuted:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT:    slli a0, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB287_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB287_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v16, v8
-; ZVFHMIN-NEXT:    vmv4r.v v12, v4
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
-  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT:    slli a0, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB288_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB288_2:
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT:    slli a0, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB289_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB289_2:
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16, v0.t
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a4, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT:    slli a1, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB290_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB290_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v16, v8
-; ZVFHMIN-NEXT:    vmv4r.v v12, v4
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
-  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
-  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_commute:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16, v0.t
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_commute:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a4, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT:    slli a1, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB291_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB291_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v16, v8
-; ZVFHMIN-NEXT:    vmv4r.v v12, v4
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
-  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
-  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v24, v8, a1
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a1
-; ZVFHMIN-NEXT:    slli a1, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    mv a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
-; ZVFHMIN-NEXT:    add a4, a4, a5
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 5
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB292_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB292_2:
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v0, v16
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
-  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
-; ZVFHMIN-NEXT:    slli a1, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    mv a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
-; ZVFHMIN-NEXT:    add a4, a4, a5
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB293_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB293_2:
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vmv.v.v v16, v8
+; ZVFHMIN-NEXT:    vmv4r.v v12, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -9739,20 +8232,20 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
+  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
 }
 
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_commute:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16, v0.t
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
@@ -9760,30 +8253,31 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v3, v0
+; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    mv a4, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, a1, a4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
@@ -9814,10 +8308,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB294_2
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB291_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB294_2:
+; ZVFHMIN-NEXT:  .LBB291_2:
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
@@ -9870,249 +8364,134 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
-  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16, v0.t
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    slli a1, a3, 1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v8, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB295_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB295_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v16, v8
-; ZVFHMIN-NEXT:    vmv4r.v v12, v4
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
-  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
-  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
 }
 
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v7
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT:    vxor.vx v24, v8, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v16, a1
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sub a4, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v8, a2
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB296_2
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB292_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB296_2:
+; ZVFHMIN-NEXT:  .LBB292_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v24
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v0, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -10120,20 +8499,20 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
 }
 
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
 ; ZVFH:       # %bb.0:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
@@ -10141,80 +8520,74 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    sub a4, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a3
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v8, a2
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB297_2
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB293_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB297_2:
+; ZVFHMIN-NEXT:  .LBB293_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a1, a0
@@ -10222,15 +8595,20 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v0
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -10240,79 +8618,66 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
-  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
 }
 
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v16, v8, v24, v0.t
-; ZVFH-NEXT:    vmv.v.v v8, v16
+; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16, v0.t
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a2, v0.t
-; ZVFHMIN-NEXT:    slli a0, a3, 1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
+; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
+; ZVFHMIN-NEXT:    sub a2, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    slli a5, a5, 4
-; ZVFHMIN-NEXT:    add a5, sp, a5
-; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2, v0.t
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -10320,30 +8685,39 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v24, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB298_2
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB294_2
 ; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB298_2:
+; ZVFHMIN-NEXT:    mv a0, a1
+; ZVFHMIN-NEXT:  .LBB294_2:
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
@@ -10353,27 +8727,19 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vmv.v.v v16, v8
 ; ZVFHMIN-NEXT:    vmv4r.v v12, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -10381,68 +8747,68 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
-  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
-  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
 }
 
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16_commuted:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24, v0.t
+; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16, v0.t
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_commuted:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v3, v0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT:    slli a0, a3, 1
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
+; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
+; ZVFHMIN-NEXT:    sub a2, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v6
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -10450,38 +8816,38 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB299_2
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v8, v0.t
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB295_2
 ; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB299_2:
+; ZVFHMIN-NEXT:    mv a0, a1
+; ZVFHMIN-NEXT:  .LBB295_2:
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8, v0.t
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
@@ -10490,15 +8856,15 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vmv.v.v v16, v8
@@ -10512,68 +8878,69 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
-  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
-  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
 }
 
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_unmasked:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT:    slli a0, a3, 1
+; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    sub a2, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -10589,16 +8956,16 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB300_2
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB296_2
 ; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB300_2:
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    mv a0, a1
+; ZVFHMIN-NEXT:  .LBB296_2:
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
@@ -10607,7 +8974,7 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a1, a0
@@ -10615,14 +8982,14 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -10630,68 +8997,69 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
-  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
-  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
 }
 
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vl8re16.v v24, (a0)
-; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vf v8, fa0, v16
 ; ZVFH-NEXT:    ret
 ;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT:    slli a0, a3, 1
+; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    sub a4, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    sub a2, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -10699,33 +9067,33 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a1, a0, .LBB301_2
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8, v0.t
+; ZVFHMIN-NEXT:    bltu a0, a1, .LBB297_2
 ; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:  .LBB301_2:
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    mv a0, a1
+; ZVFHMIN-NEXT:  .LBB297_2:
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a1, a0
@@ -10733,14 +9101,15 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v0
+; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -10748,6 +9117,61 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+  %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+  %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v16, v8, v24, v0.t
+; ZVFH-NEXT:    vmv.v.v v8, v16
+; ZVFH-NEXT:    ret
+  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16_commuted:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24, v0.t
+; ZVFH-NEXT:    ret
+  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT:    ret
+  %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vl8re16.v v24, (a0)
+; ZVFH-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT:    vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT:    ret
   %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -10781,7 +9205,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -10912,7 +9336,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -11038,7 +9462,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -11168,7 +9592,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -11296,11 +9720,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    vxor.vx v8, v16, a2, v0.t
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
@@ -11425,11 +9849,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    vxor.vx v8, v16, a2, v0.t
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
@@ -11560,11 +9984,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
 ; ZVFHMIN-NEXT:    vmv4r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
@@ -11679,11 +10103,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a2, a4, a2
 ; ZVFHMIN-NEXT:    vmv4r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
index 394887fee67fc..803680dd09061 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
@@ -177,13 +177,13 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -230,14 +230,14 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -499,13 +499,13 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -558,14 +558,14 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
index 5c5542619b6ef..43b62bb7f9f76 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
@@ -177,13 +177,13 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -230,14 +230,14 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -499,13 +499,13 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -558,14 +558,14 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
index eb77b4b4dbac3..39f0163de048c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
@@ -489,13 +489,13 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -548,14 +548,14 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -592,68 +592,6 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfmul.vf v8, v8, fa0, v0.t
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfmul_vf_nxv32f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x half> @llvm.vp.fmul.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -666,57 +604,6 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfmul.vf v8, v8, fa0
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfmul_vf_nxv32f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x half> @llvm.vp.fmul.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
index 03de2c97e685c..37ee3ad000854 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
@@ -1096,14 +1096,14 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    slli a5, a1, 3
 ; CHECK-NEXT:    sub a6, a4, a1
 ; CHECK-NEXT:    add a7, a2, a5
-; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    vl8re64.v v8, (a7)
 ; CHECK-NEXT:    csrr a7, vlenb
 ; CHECK-NEXT:    slli a7, a7, 3
 ; CHECK-NEXT:    add a7, sp, a7
 ; CHECK-NEXT:    addi a7, a7, 16
 ; CHECK-NEXT:    vs8r.v v8, (a7) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    sltu a7, a4, a6
+; CHECK-NEXT:    sltu a7, a1, a4
+; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    addi a7, a7, -1
 ; CHECK-NEXT:    vl8re64.v v8, (a5)
 ; CHECK-NEXT:    csrr a5, vlenb
@@ -1217,7 +1217,7 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    sub a5, a4, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v24, (a3)
-; CHECK-NEXT:    sltu a3, a4, a5
+; CHECK-NEXT:    sltu a3, a1, a4
 ; CHECK-NEXT:    vl8re64.v v8, (a2)
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
index 96fbe3f6ff025..a78fea1ef3110 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
@@ -799,7 +799,7 @@ define <vscale x 16 x double> @vfneg_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -821,7 +821,7 @@ define <vscale x 16 x double> @vfneg_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
index 458795db7965d..c759f2b48f53f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
@@ -93,7 +93,7 @@ define <vscale x 32 x float> @vfpext_nxv32f16_nxv32f32(<vscale x 32 x half> %a,
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
index 7127d10e67dbc..5a0e0e8004af8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
@@ -469,7 +469,7 @@ define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -497,7 +497,7 @@ define <vscale x 32 x i32> @vfptosi_nxv32i32_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -520,7 +520,7 @@ define <vscale x 32 x i32> @vfptosi_nxv32i32_nxv32f32_unmasked(<vscale x 32 x fl
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
index 07b58ed057508..03c5f7eed3fc0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
@@ -469,7 +469,7 @@ define <vscale x 32 x i16> @vfptoui_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -497,7 +497,7 @@ define <vscale x 32 x i32> @vfptoui_nxv32i32_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -520,7 +520,7 @@ define <vscale x 32 x i32> @vfptoui_nxv32i32_nxv32f32_unmasked(<vscale x 32 x fl
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
index 4177672b3a306..0f78e035e39d0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
@@ -92,7 +92,7 @@ define <vscale x 16 x float> @vfptrunc_nxv16f32_nxv16f64(<vscale x 16 x double>
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -135,11 +135,11 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    slli a3, a1, 1
 ; CHECK-NEXT:    add a6, a0, a4
 ; CHECK-NEXT:    sub a0, a2, a3
-; CHECK-NEXT:    sltu a4, a2, a0
+; CHECK-NEXT:    sltu a4, a3, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a0, a4, a0
 ; CHECK-NEXT:    sub a4, a0, a1
-; CHECK-NEXT:    sltu a7, a0, a4
+; CHECK-NEXT:    sltu a7, a1, a0
 ; CHECK-NEXT:    addi a7, a7, -1
 ; CHECK-NEXT:    and a4, a7, a4
 ; CHECK-NEXT:    srli a7, a1, 2
@@ -162,7 +162,7 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    mv a2, a3
 ; CHECK-NEXT:  .LBB8_4:
 ; CHECK-NEXT:    sub a0, a2, a1
-; CHECK-NEXT:    sltu a3, a2, a0
+; CHECK-NEXT:    sltu a3, a1, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a0, a3, a0
 ; CHECK-NEXT:    vmv1r.v v0, v6
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
index 451b13edb794e..a77b8a6905f71 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
@@ -161,7 +161,7 @@ define <vscale x 32 x bfloat> @vfsqrt_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    and a3, a4, a3
@@ -196,7 +196,7 @@ define <vscale x 32 x bfloat> @vfsqrt_vv_nxv32bf16_unmasked(<vscale x 32 x bfloa
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v16, a2
@@ -437,7 +437,7 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    and a3, a4, a3
@@ -478,7 +478,7 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    addi a4, a4, -1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v16, a2
@@ -715,7 +715,7 @@ define <vscale x 16 x double> @vfsqrt_vv_nxv16f64(<vscale x 16 x double> %va, <v
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -737,7 +737,7 @@ define <vscale x 16 x double> @vfsqrt_vv_nxv16f64_unmasked(<vscale x 16 x double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
index 6637aced3cdac..ce30d9257cb02 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
@@ -367,13 +367,13 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a4, a1, a0
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -420,14 +420,14 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    sltu a4, a1, a0
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -459,67 +459,6 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 }
 
 define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfsub_vf_nxv32bf16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vmv.v.x v24, a1
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB22_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB22_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x bfloat> @llvm.vp.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -527,56 +466,6 @@ define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 }
 
 define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfsub_vf_nxv32bf16_unmasked:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmset.m v24
-; CHECK-NEXT:    vmv.v.x v16, a1
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    sltu a2, a0, a3
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v16, v24, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT:    bltu a0, a1, .LBB23_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
-; CHECK-NEXT:  .LBB23_2:
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v16, v24
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x bfloat> @llvm.vp.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -1064,13 +953,13 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1123,14 +1012,14 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    sltu a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
@@ -1167,68 +1056,6 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfsub.vf v8, v8, fa0, v0.t
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfsub_vf_nxv32f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv1r.v v7, v0
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB46_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB46_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x half> @llvm.vp.fsub.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -1241,57 +1068,6 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfsub.vf v8, v8, fa0
 ; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfsub_vf_nxv32f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    bltu a0, a1, .LBB47_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a0, a1
-; ZVFHMIN-NEXT:  .LBB47_2:
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
   %v = call <vscale x 32 x half> @llvm.vp.fsub.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
index 3cf464247250a..df4b731015243 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
@@ -396,7 +396,7 @@ define <vscale x 128 x i8> @vmax_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vs
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -421,7 +421,7 @@ define <vscale x 128 x i8> @vmax_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va, i
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -935,7 +935,7 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <v
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -960,7 +960,7 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va, i
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -987,11 +987,11 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
 ; RV32-NEXT:    srli a2, a1, 2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmax.vx v8, v8, a0, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT:    slli a3, a1, 1
+; RV32-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    slli a2, a1, 1
-; RV32-NEXT:    sub a2, a1, a2
-; RV32-NEXT:    sltu a1, a1, a2
+; RV32-NEXT:    sub a2, a1, a3
+; RV32-NEXT:    sltu a1, a3, a1
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -1007,7 +1007,7 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
 ; RV64-NEXT:    slli a2, a1, 1
 ; RV64-NEXT:    vslidedown.vx v0, v0, a3
 ; RV64-NEXT:    sub a3, a1, a2
-; RV64-NEXT:    sltu a4, a1, a3
+; RV64-NEXT:    sltu a4, a2, a1
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
index e755d099df4a8..9b5e83f94e5fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
@@ -395,7 +395,7 @@ define <vscale x 128 x i8> @vmaxu_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <v
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -420,7 +420,7 @@ define <vscale x 128 x i8> @vmaxu_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va,
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -934,7 +934,7 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -959,7 +959,7 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -986,11 +986,11 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
 ; RV32-NEXT:    srli a2, a1, 2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmaxu.vx v8, v8, a0, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT:    slli a3, a1, 1
+; RV32-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    slli a2, a1, 1
-; RV32-NEXT:    sub a2, a1, a2
-; RV32-NEXT:    sltu a1, a1, a2
+; RV32-NEXT:    sub a2, a1, a3
+; RV32-NEXT:    sltu a1, a3, a1
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -1006,7 +1006,7 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
 ; RV64-NEXT:    slli a2, a1, 1
 ; RV64-NEXT:    vslidedown.vx v0, v0, a3
 ; RV64-NEXT:    sub a3, a1, a2
-; RV64-NEXT:    sltu a4, a1, a3
+; RV64-NEXT:    sltu a4, a2, a1
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
index 961f63cbfbc95..1816b07c49c6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
@@ -396,7 +396,7 @@ define <vscale x 128 x i8> @vmin_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vs
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -421,7 +421,7 @@ define <vscale x 128 x i8> @vmin_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va, i
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -935,7 +935,7 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <v
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -960,7 +960,7 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va, i
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -987,11 +987,11 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
 ; RV32-NEXT:    srli a2, a1, 2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vmin.vx v8, v8, a0, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT:    slli a3, a1, 1
+; RV32-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    slli a2, a1, 1
-; RV32-NEXT:    sub a2, a1, a2
-; RV32-NEXT:    sltu a1, a1, a2
+; RV32-NEXT:    sub a2, a1, a3
+; RV32-NEXT:    sltu a1, a3, a1
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -1007,7 +1007,7 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
 ; RV64-NEXT:    slli a2, a1, 1
 ; RV64-NEXT:    vslidedown.vx v0, v0, a3
 ; RV64-NEXT:    sub a3, a1, a2
-; RV64-NEXT:    sltu a4, a1, a3
+; RV64-NEXT:    sltu a4, a2, a1
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
index 631799d24e14c..608790009bdb5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
@@ -395,7 +395,7 @@ define <vscale x 128 x i8> @vminu_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <v
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -420,7 +420,7 @@ define <vscale x 128 x i8> @vminu_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va,
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
@@ -934,7 +934,7 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -959,7 +959,7 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
@@ -986,11 +986,11 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
 ; RV32-NEXT:    srli a2, a1, 2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vminu.vx v8, v8, a0, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT:    slli a3, a1, 1
+; RV32-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    slli a2, a1, 1
-; RV32-NEXT:    sub a2, a1, a2
-; RV32-NEXT:    sltu a1, a1, a2
+; RV32-NEXT:    sub a2, a1, a3
+; RV32-NEXT:    sltu a1, a3, a1
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
@@ -1006,7 +1006,7 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
 ; RV64-NEXT:    slli a2, a1, 1
 ; RV64-NEXT:    vslidedown.vx v0, v0, a3
 ; RV64-NEXT:    sub a3, a1, a2
-; RV64-NEXT:    sltu a4, a1, a3
+; RV64-NEXT:    sltu a4, a2, a1
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
index c96a7d774a5d5..65d37bfb31916 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
@@ -700,17 +700,17 @@ define <vscale x 128 x i8> @test_vp_reverse_nxv128i8(<vscale x 128 x i8> %src, i
 ; CHECK-NEXT:    addi a3, sp, 64
 ; CHECK-NEXT:    li a4, -1
 ; CHECK-NEXT:    sub a5, a0, a2
-; CHECK-NEXT:    add a6, a0, a3
-; CHECK-NEXT:    sltu a0, a0, a5
-; CHECK-NEXT:    add a2, a3, a2
+; CHECK-NEXT:    sltu a6, a2, a0
+; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:    addi a6, a6, -1
+; CHECK-NEXT:    add a2, a3, a2
 ; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a5, a6, a5
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vsse8.v v8, (a6), a4
-; CHECK-NEXT:    sub a6, a6, a1
-; CHECK-NEXT:    and a0, a0, a5
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsse8.v v16, (a6), a4
+; CHECK-NEXT:    vsse8.v v8, (a0), a4
+; CHECK-NEXT:    sub a0, a0, a1
+; CHECK-NEXT:    vsetvli zero, a5, e8, m8, ta, ma
+; CHECK-NEXT:    vsse8.v v16, (a0), a4
 ; CHECK-NEXT:    vle8.v v16, (a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a3)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
index b8b2ba7c5e5d3..aeee1fa8215f0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
@@ -727,7 +727,7 @@ define <vscale x 32 x i32> @vp_splat_nxv32i32(i32 %val, <vscale x 32 x i1> %m, i
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
index b83ddce61f44d..3d025a29e6725 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
@@ -470,61 +470,61 @@ define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vs
 ; CHECK-LABEL: test_vp_splice_nxv16i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a4, vlenb
+; CHECK-NEXT:    slli a5, a4, 1
+; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    slli a1, a4, 3
-; CHECK-NEXT:    slli a7, a4, 1
-; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    add a5, a0, a1
-; CHECK-NEXT:    mv a6, a2
-; CHECK-NEXT:    bltu a2, a7, .LBB22_2
+; CHECK-NEXT:    mv a7, a2
+; CHECK-NEXT:    bltu a2, a5, .LBB22_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a6, a7
+; CHECK-NEXT:    mv a7, a5
 ; CHECK-NEXT:  .LBB22_2:
 ; CHECK-NEXT:    addi sp, sp, -80
 ; CHECK-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    addi s0, sp, 80
-; CHECK-NEXT:    csrr a7, vlenb
-; CHECK-NEXT:    slli a7, a7, 5
-; CHECK-NEXT:    sub sp, sp, a7
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    slli a5, a5, 5
+; CHECK-NEXT:    sub sp, sp, a5
 ; CHECK-NEXT:    andi sp, sp, -64
-; CHECK-NEXT:    vl8re64.v v24, (a5)
-; CHECK-NEXT:    slli a5, a6, 3
+; CHECK-NEXT:    add a5, a0, a1
+; CHECK-NEXT:    slli a7, a7, 3
 ; CHECK-NEXT:    addi a6, sp, 64
-; CHECK-NEXT:    add a5, a6, a5
-; CHECK-NEXT:    mv a7, a2
+; CHECK-NEXT:    mv t0, a2
 ; CHECK-NEXT:    bltu a2, a4, .LBB22_4
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    mv a7, a4
+; CHECK-NEXT:    mv t0, a4
 ; CHECK-NEXT:  .LBB22_4:
+; CHECK-NEXT:    vl8re64.v v24, (a5)
+; CHECK-NEXT:    add a5, a6, a7
 ; CHECK-NEXT:    vl8re64.v v0, (a0)
-; CHECK-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a6)
 ; CHECK-NEXT:    sub a0, a2, a4
-; CHECK-NEXT:    add a6, a6, a1
-; CHECK-NEXT:    sub a7, a3, a4
-; CHECK-NEXT:    sltu a2, a2, a0
+; CHECK-NEXT:    sltu a2, a4, a2
 ; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a0
-; CHECK-NEXT:    sltu a0, a3, a7
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a7
-; CHECK-NEXT:    add a7, a5, a1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v16, (a6)
+; CHECK-NEXT:    and a0, a2, a0
+; CHECK-NEXT:    add a6, a6, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v24, (a7)
+; CHECK-NEXT:    vse64.v v16, (a6)
+; CHECK-NEXT:    mv a0, a3
 ; CHECK-NEXT:    bltu a3, a4, .LBB22_6
 ; CHECK-NEXT:  # %bb.5:
-; CHECK-NEXT:    mv a3, a4
+; CHECK-NEXT:    mv a0, a4
 ; CHECK-NEXT:  .LBB22_6:
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v0, (a5)
-; CHECK-NEXT:    addi a2, sp, 104
-; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v0, (a5)
+; CHECK-NEXT:    sub a2, a3, a4
+; CHECK-NEXT:    sltu a3, a4, a3
+; CHECK-NEXT:    add a5, a5, a1
+; CHECK-NEXT:    addi a4, sp, 104
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    add a1, a4, a1
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v24, (a5)
 ; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a2)
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a4)
 ; CHECK-NEXT:    addi sp, s0, -80
 ; CHECK-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -537,66 +537,66 @@ define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vs
 define <vscale x 16 x i64> @test_vp_splice_nxv16i64_negative_offset(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) #0 {
 ; CHECK-LABEL: test_vp_splice_nxv16i64_negative_offset:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a1, a4, 3
-; CHECK-NEXT:    slli a7, a4, 1
-; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    add a5, a0, a1
-; CHECK-NEXT:    mv a6, a2
-; CHECK-NEXT:    bltu a2, a7, .LBB23_2
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    slli a6, a5, 1
+; CHECK-NEXT:    addi a6, a6, -1
+; CHECK-NEXT:    slli a1, a5, 3
+; CHECK-NEXT:    mv a4, a2
+; CHECK-NEXT:    bltu a2, a6, .LBB23_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a6, a7
+; CHECK-NEXT:    mv a4, a6
 ; CHECK-NEXT:  .LBB23_2:
 ; CHECK-NEXT:    addi sp, sp, -80
 ; CHECK-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    addi s0, sp, 80
-; CHECK-NEXT:    csrr a7, vlenb
-; CHECK-NEXT:    slli a7, a7, 5
-; CHECK-NEXT:    sub sp, sp, a7
+; CHECK-NEXT:    csrr a6, vlenb
+; CHECK-NEXT:    slli a6, a6, 5
+; CHECK-NEXT:    sub sp, sp, a6
 ; CHECK-NEXT:    andi sp, sp, -64
-; CHECK-NEXT:    vl8re64.v v24, (a5)
-; CHECK-NEXT:    slli a5, a6, 3
+; CHECK-NEXT:    add a6, a0, a1
+; CHECK-NEXT:    slli a4, a4, 3
 ; CHECK-NEXT:    addi a7, sp, 64
-; CHECK-NEXT:    add a6, a7, a5
 ; CHECK-NEXT:    mv t0, a2
-; CHECK-NEXT:    bltu a2, a4, .LBB23_4
+; CHECK-NEXT:    bltu a2, a5, .LBB23_4
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    mv t0, a4
+; CHECK-NEXT:    mv t0, a5
 ; CHECK-NEXT:  .LBB23_4:
+; CHECK-NEXT:    vl8re64.v v24, (a6)
+; CHECK-NEXT:    add a6, a7, a4
 ; CHECK-NEXT:    vl8re64.v v0, (a0)
 ; CHECK-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a7)
-; CHECK-NEXT:    sub a0, a2, a4
-; CHECK-NEXT:    add a7, a7, a1
-; CHECK-NEXT:    sub t0, a3, a4
-; CHECK-NEXT:    sltu a2, a2, a0
+; CHECK-NEXT:    sub a0, a2, a5
+; CHECK-NEXT:    sltu a2, a5, a2
 ; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a0
-; CHECK-NEXT:    sltu a0, a3, t0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, t0
-; CHECK-NEXT:    add t0, a6, a1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v16, (a7)
+; CHECK-NEXT:    and a0, a2, a0
+; CHECK-NEXT:    add a7, a7, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v24, (t0)
-; CHECK-NEXT:    bltu a3, a4, .LBB23_6
+; CHECK-NEXT:    vse64.v v16, (a7)
+; CHECK-NEXT:    mv a0, a3
+; CHECK-NEXT:    bltu a3, a5, .LBB23_6
 ; CHECK-NEXT:  # %bb.5:
-; CHECK-NEXT:    mv a3, a4
+; CHECK-NEXT:    mv a0, a5
 ; CHECK-NEXT:  .LBB23_6:
-; CHECK-NEXT:    li a2, 8
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v0, (a6)
-; CHECK-NEXT:    bltu a5, a2, .LBB23_8
+; CHECK-NEXT:    sub a2, a3, a5
+; CHECK-NEXT:    sltu a3, a5, a3
+; CHECK-NEXT:    add a5, a6, a1
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    li a3, 8
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    vse64.v v24, (a5)
+; CHECK-NEXT:    bltu a4, a3, .LBB23_8
 ; CHECK-NEXT:  # %bb.7:
-; CHECK-NEXT:    li a5, 8
+; CHECK-NEXT:    li a4, 8
 ; CHECK-NEXT:  .LBB23_8:
-; CHECK-NEXT:    sub a2, a6, a5
+; CHECK-NEXT:    sub a2, a6, a4
 ; CHECK-NEXT:    add a1, a2, a1
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a2)
 ; CHECK-NEXT:    addi sp, s0, -80
 ; CHECK-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
index a075bba81d3c6..fb8480ee5f471 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
@@ -254,7 +254,7 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
 ; RV32-NEXT:    slli a2, a3, 1
 ; RV32-NEXT:    srli a3, a3, 2
 ; RV32-NEXT:    sub a4, a1, a2
-; RV32-NEXT:    sltu a5, a1, a4
+; RV32-NEXT:    sltu a5, a2, a1
 ; RV32-NEXT:    addi a5, a5, -1
 ; RV32-NEXT:    and a4, a5, a4
 ; RV32-NEXT:    vslidedown.vx v0, v0, a3
@@ -281,12 +281,12 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
 ; RV64-NEXT:    slli a3, a2, 1
 ; RV64-NEXT:    srli a4, a2, 2
 ; RV64-NEXT:    sub a5, a1, a3
+; RV64-NEXT:    sltu a6, a3, a1
 ; RV64-NEXT:    vslidedown.vx v13, v0, a4
-; RV64-NEXT:    sltu a4, a1, a5
-; RV64-NEXT:    addi a4, a4, -1
-; RV64-NEXT:    and a5, a4, a5
+; RV64-NEXT:    addi a6, a6, -1
+; RV64-NEXT:    and a5, a6, a5
 ; RV64-NEXT:    sub a4, a5, a2
-; RV64-NEXT:    sltu a6, a5, a4
+; RV64-NEXT:    sltu a6, a2, a5
 ; RV64-NEXT:    addi a6, a6, -1
 ; RV64-NEXT:    and a6, a6, a4
 ; RV64-NEXT:    srli a4, a2, 3
@@ -310,7 +310,7 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
 ; RV64-NEXT:    mv a1, a3
 ; RV64-NEXT:  .LBB12_4:
 ; RV64-NEXT:    sub a3, a1, a2
-; RV64-NEXT:    sltu a5, a1, a3
+; RV64-NEXT:    sltu a5, a2, a1
 ; RV64-NEXT:    addi a5, a5, -1
 ; RV64-NEXT:    and a3, a5, a3
 ; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
@@ -2367,7 +2367,7 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
 ; RV32-NEXT:    sub a2, a0, a1
 ; RV32-NEXT:    srli a3, a1, 3
 ; RV32-NEXT:    vslidedown.vx v0, v0, a3
-; RV32-NEXT:    sltu a3, a0, a2
+; RV32-NEXT:    sltu a3, a1, a0
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2390,7 +2390,7 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
 ; RV64-NEXT:    sub a2, a0, a1
 ; RV64-NEXT:    srli a3, a1, 3
 ; RV64-NEXT:    vslidedown.vx v0, v0, a3
-; RV64-NEXT:    sltu a3, a0, a2
+; RV64-NEXT:    sltu a3, a1, a0
 ; RV64-NEXT:    addi a3, a3, -1
 ; RV64-NEXT:    and a2, a3, a2
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2422,8 +2422,8 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a1, a2
+; RV32-NEXT:    sltu a1, a2, a1
 ; RV32-NEXT:    srli a2, a2, 3
-; RV32-NEXT:    sltu a1, a1, a3
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
@@ -2443,7 +2443,7 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
 ; RV64-NEXT:    srli a4, a2, 3
 ; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a4
-; RV64-NEXT:    sltu a4, a1, a3
+; RV64-NEXT:    sltu a4, a2, a1
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
@@ -2479,8 +2479,8 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a1, a2
+; RV32-NEXT:    sltu a1, a2, a1
 ; RV32-NEXT:    srli a2, a2, 3
-; RV32-NEXT:    sltu a1, a1, a3
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
@@ -2500,7 +2500,7 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
 ; RV64-NEXT:    srli a4, a2, 3
 ; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a4
-; RV64-NEXT:    sltu a4, a1, a3
+; RV64-NEXT:    sltu a4, a2, a1
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
@@ -2537,8 +2537,8 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a1, a2
+; RV32-NEXT:    sltu a1, a2, a1
 ; RV32-NEXT:    srli a2, a2, 3
-; RV32-NEXT:    sltu a1, a1, a3
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a2
@@ -2561,8 +2561,8 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei32.v v8, (a0), v24, v0.t
 ; RV64-NEXT:    sub a3, a1, a2
+; RV64-NEXT:    sltu a1, a2, a1
 ; RV64-NEXT:    srli a2, a2, 3
-; RV64-NEXT:    sltu a1, a1, a3
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
index 2ece316c7e54a..4d2ba719d63ed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
@@ -456,15 +456,15 @@ define <vscale x 16 x double> @vpload_nxv16f64(ptr %ptr, <vscale x 16 x i1> %m,
 ; CHECK-NEXT:    vmv1r.v v8, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    slli a4, a2, 3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    srli a5, a2, 3
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a5
-; CHECK-NEXT:    sltu a5, a1, a3
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a3, a5, a3
-; CHECK-NEXT:    add a4, a0, a4
+; CHECK-NEXT:    slli a5, a2, 3
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a4), v0.t
+; CHECK-NEXT:    vle64.v v16, (a5), v0.t
 ; CHECK-NEXT:    bltu a1, a2, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a1, a2
@@ -496,18 +496,18 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
 ; CHECK-NEXT:    mv a4, a5
 ; CHECK-NEXT:  .LBB45_2:
 ; CHECK-NEXT:    sub a6, a4, a3
-; CHECK-NEXT:    slli a7, a3, 3
-; CHECK-NEXT:    sltu t0, a4, a6
-; CHECK-NEXT:    addi t0, t0, -1
-; CHECK-NEXT:    and a6, t0, a6
-; CHECK-NEXT:    srli t0, a3, 3
-; CHECK-NEXT:    sub t1, a2, a5
-; CHECK-NEXT:    add a5, a0, a7
-; CHECK-NEXT:    sltu a2, a2, t1
+; CHECK-NEXT:    sltu a7, a3, a4
+; CHECK-NEXT:    sub t0, a2, a5
+; CHECK-NEXT:    sltu a2, a5, a2
+; CHECK-NEXT:    slli a5, a3, 3
+; CHECK-NEXT:    addi a7, a7, -1
+; CHECK-NEXT:    and a6, a7, a6
+; CHECK-NEXT:    srli a7, a3, 3
+; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, t1
-; CHECK-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, t0
+; CHECK-NEXT:    and a2, a2, t0
+; CHECK-NEXT:    vsetvli t0, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a7
 ; CHECK-NEXT:    bltu a2, a3, .LBB45_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a2, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
index f92ee37051840..01edd0f912bd6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
@@ -198,22 +198,22 @@ define <vscale x 128 x i1> @vpmerge_nxv128i1(<vscale x 128 x i1> %va, <vscale x
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB7_2:
-; CHECK-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
+; CHECK-NEXT:    sub a3, a0, a2
+; CHECK-NEXT:    sltu a0, a2, a0
+; CHECK-NEXT:    vsetvli a2, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    sub a2, a0, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vim v24, v16, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
-; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vmv1r.v v0, v4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
-; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmsne.vi v24, v8, 0
-; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    vmv1r.v v0, v5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
@@ -547,7 +547,7 @@ define <vscale x 128 x i8> @vpmerge_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    sub a4, a3, a1
 ; CHECK-NEXT:    vl8r.v v24, (a2)
-; CHECK-NEXT:    sltu a2, a3, a4
+; CHECK-NEXT:    sltu a2, a1, a3
 ; CHECK-NEXT:    vl8r.v v8, (a0)
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a4
@@ -583,7 +583,7 @@ define <vscale x 128 x i8> @vpmerge_vx_nxv128i8(i8 %a, <vscale x 128 x i8> %vb,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, tu, ma
@@ -611,7 +611,7 @@ define <vscale x 128 x i8> @vpmerge_vi_nxv128i8(<vscale x 128 x i8> %vb, <vscale
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub a2, a1, a0
-; CHECK-NEXT:    sltu a3, a1, a2
+; CHECK-NEXT:    sltu a3, a0, a1
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
index 7e4a60095d7cc..153a0a70d098a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
@@ -2193,8 +2193,8 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (zero), v24, v0.t
 ; RV32-NEXT:    sub a2, a1, a0
+; RV32-NEXT:    sltu a1, a0, a1
 ; RV32-NEXT:    srli a0, a0, 3
-; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a0
@@ -2226,8 +2226,8 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (zero), v24, v0.t
 ; RV64-NEXT:    sub a0, a2, a1
+; RV64-NEXT:    sltu a2, a1, a2
 ; RV64-NEXT:    srli a1, a1, 3
-; RV64-NEXT:    sltu a2, a2, a0
 ; RV64-NEXT:    addi a2, a2, -1
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a1
@@ -2263,8 +2263,8 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a2, a1
+; RV32-NEXT:    sltu a2, a1, a2
 ; RV32-NEXT:    srli a1, a1, 3
-; RV32-NEXT:    sltu a2, a2, a3
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a1
@@ -2298,8 +2298,8 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
 ; RV64-NEXT:    sub a3, a2, a1
+; RV64-NEXT:    sltu a2, a1, a2
 ; RV64-NEXT:    srli a1, a1, 3
-; RV64-NEXT:    sltu a2, a2, a3
 ; RV64-NEXT:    addi a2, a2, -1
 ; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a1
@@ -2336,8 +2336,8 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a2, a1
+; RV32-NEXT:    sltu a2, a1, a2
 ; RV32-NEXT:    srli a1, a1, 3
-; RV32-NEXT:    sltu a2, a2, a3
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a1
@@ -2371,8 +2371,8 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
 ; RV64-NEXT:    sub a3, a2, a1
+; RV64-NEXT:    sltu a2, a1, a2
 ; RV64-NEXT:    srli a1, a1, 3
-; RV64-NEXT:    sltu a2, a2, a3
 ; RV64-NEXT:    addi a2, a2, -1
 ; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a1
@@ -2410,8 +2410,8 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a2, a1
+; RV32-NEXT:    sltu a2, a1, a2
 ; RV32-NEXT:    srli a1, a1, 3
-; RV32-NEXT:    sltu a2, a2, a3
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a1
@@ -2435,8 +2435,8 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV64-NEXT:    sub a3, a2, a1
+; RV64-NEXT:    sltu a2, a1, a2
 ; RV64-NEXT:    srli a1, a1, 3
-; RV64-NEXT:    sltu a2, a2, a3
 ; RV64-NEXT:    addi a2, a2, -1
 ; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
index 9fd8b9d23cb5e..3468fda9011a0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
@@ -373,8 +373,8 @@ define void @vpstore_nxv16f64(<vscale x 16 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
+; CHECK-NEXT:    sltu a1, a2, a1
 ; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sltu a1, a1, a3
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a1, a1, a3
 ; CHECK-NEXT:    add a0, a0, a2
@@ -409,20 +409,20 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a1), v0.t
 ; CHECK-NEXT:    sub a0, a5, a3
-; CHECK-NEXT:    srli a6, a3, 3
+; CHECK-NEXT:    sltu a5, a3, a5
+; CHECK-NEXT:    sub a6, a2, a4
+; CHECK-NEXT:    sltu a2, a4, a2
+; CHECK-NEXT:    srli a4, a3, 3
 ; CHECK-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v7, a6
-; CHECK-NEXT:    slli a6, a3, 3
-; CHECK-NEXT:    sub a4, a2, a4
-; CHECK-NEXT:    sltu a5, a5, a0
-; CHECK-NEXT:    add a6, a1, a6
-; CHECK-NEXT:    sltu a2, a2, a4
+; CHECK-NEXT:    vslidedown.vx v0, v7, a4
+; CHECK-NEXT:    slli a4, a3, 3
 ; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    add a4, a1, a4
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a5, a5, a0
-; CHECK-NEXT:    and a0, a2, a4
+; CHECK-NEXT:    and a0, a2, a6
 ; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v16, (a6), v0.t
+; CHECK-NEXT:    vse64.v v16, (a4), v0.t
 ; CHECK-NEXT:    bltu a0, a3, .LBB36_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a0, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
index df97f19df7f99..4f31167b80691 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
@@ -91,7 +91,7 @@ define half @vpreduce_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscale x
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v24, v0, a1
 ; CHECK-NEXT:    sub a1, a0, a2
-; CHECK-NEXT:    sltu a3, a0, a1
+; CHECK-NEXT:    sltu a3, a2, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a1, a3, a1
 ; CHECK-NEXT:    bltu a0, a2, .LBB6_2
@@ -120,7 +120,7 @@ define half @vpreduce_ord_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscal
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v24, v0, a1
 ; CHECK-NEXT:    sub a1, a0, a2
-; CHECK-NEXT:    sltu a3, a0, a1
+; CHECK-NEXT:    sltu a3, a2, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a1, a3, a1
 ; CHECK-NEXT:    bltu a0, a2, .LBB7_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
index 7eea35afe0aa0..f2b84c28db92e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
@@ -884,7 +884,7 @@ define signext i32 @vpreduce_umax_nxv32i32(i32 signext %s, <vscale x 32 x i32> %
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v24, v0, a2
 ; CHECK-NEXT:    sub a2, a1, a3
-; CHECK-NEXT:    sltu a4, a1, a2
+; CHECK-NEXT:    sltu a4, a3, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a2, a4, a2
 ; CHECK-NEXT:    bltu a1, a3, .LBB67_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
index 1e629e9d20530..535a5bdb839e0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
@@ -318,7 +318,7 @@ define zeroext i1 @vpreduce_or_nxv128i1(i1 zeroext %s, <vscale x 128 x i1> %v, <
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
+; CHECK-NEXT:    sltu a4, a2, a1
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    vmv1r.v v0, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
index 98634fe55de41..b4ed1857652f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
@@ -557,7 +557,7 @@ define <vscale x 128 x i8> @vsadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub a2, a1, a0
-; CHECK-NEXT:    sltu a3, a1, a2
+; CHECK-NEXT:    sltu a3, a0, a1
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
@@ -580,7 +580,7 @@ define <vscale x 128 x i8> @vsadd_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
@@ -1312,7 +1312,7 @@ define <vscale x 32 x i32> @vsadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -1335,7 +1335,7 @@ define <vscale x 32 x i32> @vsadd_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
index a7d304261f87f..d761b8da7929c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
@@ -556,7 +556,7 @@ define <vscale x 128 x i8> @vsaddu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub a2, a1, a0
-; CHECK-NEXT:    sltu a3, a1, a2
+; CHECK-NEXT:    sltu a3, a0, a1
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
@@ -579,7 +579,7 @@ define <vscale x 128 x i8> @vsaddu_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
@@ -1311,7 +1311,7 @@ define <vscale x 32 x i32> @vsaddu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -1334,7 +1334,7 @@ define <vscale x 32 x i32> @vsaddu_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index d1933560f2698..e6ef1bcf73a3d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -308,12 +308,12 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a4, a3, 3
 ; CHECK-NEXT:    slli a1, a3, 1
-; CHECK-NEXT:    srli a3, a3, 2
 ; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    sub a5, a2, a1
 ; CHECK-NEXT:    vl8re32.v v24, (a4)
-; CHECK-NEXT:    sltu a4, a2, a5
+; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    srli a3, a3, 2
 ; CHECK-NEXT:    vl8re32.v v8, (a0)
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    and a4, a4, a5
@@ -349,14 +349,14 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
 ; RV32-NEXT:    slli a2, a1, 3
 ; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    slli a2, a1, 1
-; RV32-NEXT:    sub a2, a1, a2
 ; RV32-NEXT:    vl8re32.v v24, (a0)
-; RV32-NEXT:    sltu a0, a1, a2
-; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    sub a0, a1, a2
+; RV32-NEXT:    sltu a2, a2, a1
+; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    srli a1, a1, 2
 ; RV32-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a1
-; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV32-NEXT:    vmerge.vvm v16, v24, v16, v0
 ; RV32-NEXT:    ret
@@ -376,16 +376,16 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a3, a1, 3
 ; RV64-NEXT:    slli a2, a1, 1
-; RV64-NEXT:    srli a4, a1, 2
 ; RV64-NEXT:    add a3, a0, a3
-; RV64-NEXT:    sub a5, a1, a2
+; RV64-NEXT:    sub a4, a1, a2
+; RV64-NEXT:    sltu a5, a2, a1
 ; RV64-NEXT:    vl8re32.v v24, (a3)
-; RV64-NEXT:    sltu a3, a1, a5
-; RV64-NEXT:    addi a3, a3, -1
+; RV64-NEXT:    addi a5, a5, -1
+; RV64-NEXT:    srli a3, a1, 2
 ; RV64-NEXT:    vl8re32.v v8, (a0)
-; RV64-NEXT:    vslidedown.vx v0, v0, a4
-; RV64-NEXT:    and a3, a3, a5
-; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; RV64-NEXT:    vslidedown.vx v0, v0, a3
+; RV64-NEXT:    and a4, a5, a4
+; RV64-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
 ; RV64-NEXT:    vmerge.vvm v16, v24, v16, v0
 ; RV64-NEXT:    bltu a1, a2, .LBB28_2
 ; RV64-NEXT:  # %bb.1:
@@ -637,10 +637,10 @@ define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
 ; CHECK-NEXT:    sub a4, a2, a1
+; CHECK-NEXT:    sltu a5, a1, a2
 ; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    sltu a5, a2, a4
-; CHECK-NEXT:    vl8re64.v v24, (a3)
 ; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    vl8re64.v v24, (a3)
 ; CHECK-NEXT:    srli a3, a1, 3
 ; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
index 07411b1c7ae08..c8bb009d2c3b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
@@ -144,7 +144,7 @@ define <vscale x 32 x i32> @vsext_nxv32i8_nxv32i32(<vscale x 32 x i8> %a, <vscal
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -168,7 +168,7 @@ define <vscale x 32 x i32> @vsext_nxv32i8_nxv32i32_unmasked(<vscale x 32 x i8> %
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
index 7f96da141c363..90f1ca0843b02 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
@@ -478,7 +478,7 @@ define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFH-NEXT:    slli a1, a1, 1
 ; ZVFH-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFH-NEXT:    sub a2, a0, a1
-; ZVFH-NEXT:    sltu a3, a0, a2
+; ZVFH-NEXT:    sltu a3, a1, a0
 ; ZVFH-NEXT:    addi a3, a3, -1
 ; ZVFH-NEXT:    and a2, a3, a2
 ; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -502,7 +502,7 @@ define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a2, a3, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -534,7 +534,7 @@ define <vscale x 32 x float> @vsitofp_nxv32f32_nxv32i32(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -557,7 +557,7 @@ define <vscale x 32 x float> @vsitofp_nxv32f32_nxv32i32_unmasked(<vscale x 32 x
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
index 0ac2ef7e251c0..a6a631be9dab4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
@@ -574,7 +574,7 @@ define <vscale x 128 x i8> @vssub_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub a2, a1, a0
-; CHECK-NEXT:    sltu a3, a1, a2
+; CHECK-NEXT:    sltu a3, a0, a1
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
@@ -598,7 +598,7 @@ define <vscale x 128 x i8> @vssub_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
@@ -1353,7 +1353,7 @@ define <vscale x 32 x i32> @vssub_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
@@ -1377,7 +1377,7 @@ define <vscale x 32 x i32> @vssub_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
index bde279a4d1f2b..1992b97e0de0d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
@@ -572,7 +572,7 @@ define <vscale x 128 x i8> @vssubu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub a2, a1, a0
-; CHECK-NEXT:    sltu a3, a1, a2
+; CHECK-NEXT:    sltu a3, a0, a1
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
@@ -596,7 +596,7 @@ define <vscale x 128 x i8> @vssubu_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
@@ -1351,7 +1351,7 @@ define <vscale x 32 x i32> @vssubu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
@@ -1375,7 +1375,7 @@ define <vscale x 32 x i32> @vssubu_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a2
 ; CHECK-NEXT:    li a2, -1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
index 0c1ca369521f7..0b07b60da8250 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
@@ -147,7 +147,7 @@ define <vscale x 15 x i16> @vtrunc_nxv15i16_nxv15i64(<vscale x 15 x i64> %a, <vs
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    sltu a2, a1, a0
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -201,7 +201,7 @@ define <vscale x 32 x i7> @vtrunc_nxv32i7_nxv32i32(<vscale x 32 x i32> %a, <vsca
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -233,7 +233,7 @@ define <vscale x 32 x i8> @vtrunc_nxv32i8_nxv32i32(<vscale x 32 x i32> %a, <vsca
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -280,11 +280,11 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    slli a3, a1, 1
 ; CHECK-NEXT:    add a6, a0, a4
 ; CHECK-NEXT:    sub a0, a2, a3
-; CHECK-NEXT:    sltu a4, a2, a0
+; CHECK-NEXT:    sltu a4, a3, a2
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a0, a4, a0
 ; CHECK-NEXT:    sub a4, a0, a1
-; CHECK-NEXT:    sltu a7, a0, a4
+; CHECK-NEXT:    sltu a7, a1, a0
 ; CHECK-NEXT:    addi a7, a7, -1
 ; CHECK-NEXT:    and a4, a7, a4
 ; CHECK-NEXT:    srli a7, a1, 2
@@ -307,7 +307,7 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    mv a2, a3
 ; CHECK-NEXT:  .LBB17_4:
 ; CHECK-NEXT:    sub a0, a2, a1
-; CHECK-NEXT:    sltu a3, a2, a0
+; CHECK-NEXT:    sltu a3, a1, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a0, a3, a0
 ; CHECK-NEXT:    vmv1r.v v0, v6
diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
index c0c749ebf3186..807c2d9fa3ce6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
@@ -478,7 +478,7 @@ define <vscale x 32 x half> @vuitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFH-NEXT:    slli a1, a1, 1
 ; ZVFH-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFH-NEXT:    sub a2, a0, a1
-; ZVFH-NEXT:    sltu a3, a0, a2
+; ZVFH-NEXT:    sltu a3, a1, a0
 ; ZVFH-NEXT:    addi a3, a3, -1
 ; ZVFH-NEXT:    and a2, a3, a2
 ; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
@@ -502,7 +502,7 @@ define <vscale x 32 x half> @vuitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    sltu a3, a1, a0
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a2, a3, a2
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -534,7 +534,7 @@ define <vscale x 32 x float> @vuitofp_nxv32f32_nxv32i32(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -557,7 +557,7 @@ define <vscale x 32 x float> @vuitofp_nxv32f32_nxv32i32_unmasked(<vscale x 32 x
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
index 9713b617b8384..44a1084b4a208 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
@@ -144,7 +144,7 @@ define <vscale x 32 x i32> @vzext_nxv32i8_nxv32i32(<vscale x 32 x i8> %a, <vscal
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -168,7 +168,7 @@ define <vscale x 32 x i32> @vzext_nxv32i8_nxv32i32_unmasked(<vscale x 32 x i8> %
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    sltu a3, a1, a0
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/usub_sat.ll b/llvm/test/CodeGen/RISCV/usub_sat.ll
index 33056682dcc79..6fcc6bc5f3dcd 100644
--- a/llvm/test/CodeGen/RISCV/usub_sat.ll
+++ b/llvm/test/CodeGen/RISCV/usub_sat.ll
@@ -7,10 +7,10 @@
 define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
 ; RV32I-LABEL: func:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func:
@@ -57,10 +57,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ;
 ; RV64I-LABEL: func2:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func2:
@@ -93,18 +93,18 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y) nounwind {
 ; RV32I-LABEL: func16:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func16:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func16:
@@ -125,18 +125,18 @@ define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y) nounwind {
 define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y) nounwind {
 ; RV32I-LABEL: func8:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func8:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func8:
@@ -157,18 +157,18 @@ define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y) nounwind {
 define zeroext i4 @func3(i4 zeroext %x, i4 zeroext %y) nounwind {
 ; RV32I-LABEL: func3:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func3:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func3:
diff --git a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
index ef6bc022ddc9f..838f2dbe2276d 100644
--- a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
@@ -8,10 +8,10 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; RV32I-LABEL: func32:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mul a1, a1, a2
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func32:
@@ -65,7 +65,7 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV64I-LABEL: func64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sub a1, a0, a2
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sltu a0, a2, a0
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -106,10 +106,10 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV32I-NEXT:    addi a3, a3, -1
 ; RV32I-NEXT:    and a0, a0, a3
 ; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func16:
@@ -119,10 +119,10 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV64I-NEXT:    addi a3, a3, -1
 ; RV64I-NEXT:    and a0, a0, a3
 ; RV64I-NEXT:    and a1, a1, a3
-; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func16:
@@ -153,10 +153,10 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; RV32I-NEXT:    zext.b a0, a0
 ; RV32I-NEXT:    mul a1, a1, a2
 ; RV32I-NEXT:    zext.b a1, a1
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func8:
@@ -164,10 +164,10 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; RV64I-NEXT:    zext.b a0, a0
 ; RV64I-NEXT:    mul a1, a1, a2
 ; RV64I-NEXT:    zext.b a1, a1
-; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func8:
@@ -198,10 +198,10 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; RV32I-NEXT:    andi a0, a0, 15
 ; RV32I-NEXT:    mul a1, a1, a2
 ; RV32I-NEXT:    andi a1, a1, 15
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func4:
@@ -209,10 +209,10 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; RV64I-NEXT:    andi a0, a0, 15
 ; RV64I-NEXT:    mul a1, a1, a2
 ; RV64I-NEXT:    andi a1, a1, 15
-; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    sltu a0, a0, a1
+; RV64I-NEXT:    sub a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func4:
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index 62f08d7831dda..0de2cbd76b749 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -937,9 +937,10 @@ entry:
 define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ; RV32-LABEL: usubo.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    sub a1, a0, a1
-; RV32-NEXT:    sltu a0, a0, a1
-; RV32-NEXT:    sw a1, 0(a2)
+; RV32-NEXT:    sltu a3, a1, a0
+; RV32-NEXT:    sub a0, a0, a1
+; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    mv a0, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: usubo.i32:
@@ -951,9 +952,10 @@ define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ;
 ; RV32ZBA-LABEL: usubo.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    sub a1, a0, a1
-; RV32ZBA-NEXT:    sltu a0, a0, a1
-; RV32ZBA-NEXT:    sw a1, 0(a2)
+; RV32ZBA-NEXT:    sltu a3, a1, a0
+; RV32ZBA-NEXT:    sub a0, a0, a1
+; RV32ZBA-NEXT:    sw a0, 0(a2)
+; RV32ZBA-NEXT:    mv a0, a3
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: usubo.i32:
@@ -965,9 +967,10 @@ define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ;
 ; RV32ZICOND-LABEL: usubo.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    sub a1, a0, a1
-; RV32ZICOND-NEXT:    sltu a0, a0, a1
-; RV32ZICOND-NEXT:    sw a1, 0(a2)
+; RV32ZICOND-NEXT:    sltu a3, a1, a0
+; RV32ZICOND-NEXT:    sub a0, a0, a1
+; RV32ZICOND-NEXT:    sw a0, 0(a2)
+; RV32ZICOND-NEXT:    mv a0, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: usubo.i32:
@@ -987,9 +990,11 @@ entry:
 define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
 ; RV32-LABEL: usubo.i32.constant.rhs:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    addi a2, a0, 2
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    sw a2, 0(a1)
+; RV32-NEXT:    addi a2, a0, 1
+; RV32-NEXT:    seqz a2, a2
+; RV32-NEXT:    addi a0, a0, 2
+; RV32-NEXT:    sw a0, 0(a1)
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: usubo.i32.constant.rhs:
@@ -1001,9 +1006,11 @@ define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
 ;
 ; RV32ZBA-LABEL: usubo.i32.constant.rhs:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    addi a2, a0, 2
-; RV32ZBA-NEXT:    sltu a0, a0, a2
-; RV32ZBA-NEXT:    sw a2, 0(a1)
+; RV32ZBA-NEXT:    addi a2, a0, 1
+; RV32ZBA-NEXT:    seqz a2, a2
+; RV32ZBA-NEXT:    addi a0, a0, 2
+; RV32ZBA-NEXT:    sw a0, 0(a1)
+; RV32ZBA-NEXT:    mv a0, a2
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: usubo.i32.constant.rhs:
@@ -1015,9 +1022,11 @@ define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
 ;
 ; RV32ZICOND-LABEL: usubo.i32.constant.rhs:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    addi a2, a0, 2
-; RV32ZICOND-NEXT:    sltu a0, a0, a2
-; RV32ZICOND-NEXT:    sw a2, 0(a1)
+; RV32ZICOND-NEXT:    addi a2, a0, 1
+; RV32ZICOND-NEXT:    seqz a2, a2
+; RV32ZICOND-NEXT:    addi a0, a0, 2
+; RV32ZICOND-NEXT:    sw a0, 0(a1)
+; RV32ZICOND-NEXT:    mv a0, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: usubo.i32.constant.rhs:
@@ -1039,8 +1048,7 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    li a2, -2
 ; RV32-NEXT:    sub a2, a2, a0
-; RV32-NEXT:    addi a0, a2, 1
-; RV32-NEXT:    seqz a0, a0
+; RV32-NEXT:    sltiu a0, a0, -2
 ; RV32-NEXT:    sw a2, 0(a1)
 ; RV32-NEXT:    ret
 ;
@@ -1057,8 +1065,7 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    li a2, -2
 ; RV32ZBA-NEXT:    sub a2, a2, a0
-; RV32ZBA-NEXT:    addi a0, a2, 1
-; RV32ZBA-NEXT:    seqz a0, a0
+; RV32ZBA-NEXT:    sltiu a0, a0, -2
 ; RV32ZBA-NEXT:    sw a2, 0(a1)
 ; RV32ZBA-NEXT:    ret
 ;
@@ -1075,8 +1082,7 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    li a2, -2
 ; RV32ZICOND-NEXT:    sub a2, a2, a0
-; RV32ZICOND-NEXT:    addi a0, a2, 1
-; RV32ZICOND-NEXT:    seqz a0, a0
+; RV32ZICOND-NEXT:    sltiu a0, a0, -2
 ; RV32ZICOND-NEXT:    sw a2, 0(a1)
 ; RV32ZICOND-NEXT:    ret
 ;
@@ -1116,9 +1122,10 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ;
 ; RV64-LABEL: usubo.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
-; RV64-NEXT:    sd a1, 0(a2)
+; RV64-NEXT:    sltu a3, a1, a0
+; RV64-NEXT:    sub a0, a0, a1
+; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    mv a0, a3
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: usubo.i64:
@@ -1140,9 +1147,10 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ;
 ; RV64ZBA-LABEL: usubo.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    sltu a0, a0, a1
-; RV64ZBA-NEXT:    sd a1, 0(a2)
+; RV64ZBA-NEXT:    sltu a3, a1, a0
+; RV64ZBA-NEXT:    sub a0, a0, a1
+; RV64ZBA-NEXT:    sd a0, 0(a2)
+; RV64ZBA-NEXT:    mv a0, a3
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: usubo.i64:
@@ -1163,9 +1171,10 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ;
 ; RV64ZICOND-LABEL: usubo.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    sltu a0, a0, a1
-; RV64ZICOND-NEXT:    sd a1, 0(a2)
+; RV64ZICOND-NEXT:    sltu a3, a1, a0
+; RV64ZICOND-NEXT:    sub a0, a0, a1
+; RV64ZICOND-NEXT:    sd a0, 0(a2)
+; RV64ZICOND-NEXT:    mv a0, a3
 ; RV64ZICOND-NEXT:    ret
 entry:
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
@@ -2810,8 +2819,7 @@ entry:
 define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32-LABEL: usubo.select.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    sub a2, a0, a1
-; RV32-NEXT:    bltu a0, a2, .LBB40_2
+; RV32-NEXT:    bltu a1, a0, .LBB40_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:  .LBB40_2: # %entry
@@ -2828,8 +2836,7 @@ define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZBA-LABEL: usubo.select.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    sub a2, a0, a1
-; RV32ZBA-NEXT:    bltu a0, a2, .LBB40_2
+; RV32ZBA-NEXT:    bltu a1, a0, .LBB40_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
 ; RV32ZBA-NEXT:    mv a0, a1
 ; RV32ZBA-NEXT:  .LBB40_2: # %entry
@@ -2846,8 +2853,7 @@ define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZICOND-LABEL: usubo.select.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    sub a2, a0, a1
-; RV32ZICOND-NEXT:    sltu a2, a0, a2
+; RV32ZICOND-NEXT:    sltu a2, a1, a0
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a2
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a2
 ; RV32ZICOND-NEXT:    or a0, a0, a1
@@ -2871,8 +2877,7 @@ entry:
 define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32-LABEL: usubo.not.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    sub a1, a0, a1
-; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    sltu a0, a1, a0
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:    ret
 ;
@@ -2885,8 +2890,7 @@ define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZBA-LABEL: usubo.not.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    sub a1, a0, a1
-; RV32ZBA-NEXT:    sltu a0, a0, a1
+; RV32ZBA-NEXT:    sltu a0, a1, a0
 ; RV32ZBA-NEXT:    xori a0, a0, 1
 ; RV32ZBA-NEXT:    ret
 ;
@@ -2899,8 +2903,7 @@ define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZICOND-LABEL: usubo.not.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    sub a1, a0, a1
-; RV32ZICOND-NEXT:    sltu a0, a0, a1
+; RV32ZICOND-NEXT:    sltu a0, a1, a0
 ; RV32ZICOND-NEXT:    xori a0, a0, 1
 ; RV32ZICOND-NEXT:    ret
 ;
@@ -2940,8 +2943,7 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64-LABEL: usubo.select.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sub a2, a0, a1
-; RV64-NEXT:    bltu a0, a2, .LBB42_2
+; RV64-NEXT:    bltu a1, a0, .LBB42_2
 ; RV64-NEXT:  # %bb.1: # %entry
 ; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:  .LBB42_2: # %entry
@@ -2969,8 +2971,7 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZBA-LABEL: usubo.select.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sub a2, a0, a1
-; RV64ZBA-NEXT:    bltu a0, a2, .LBB42_2
+; RV64ZBA-NEXT:    bltu a1, a0, .LBB42_2
 ; RV64ZBA-NEXT:  # %bb.1: # %entry
 ; RV64ZBA-NEXT:    mv a0, a1
 ; RV64ZBA-NEXT:  .LBB42_2: # %entry
@@ -2998,8 +2999,7 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZICOND-LABEL: usubo.select.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sub a2, a0, a1
-; RV64ZICOND-NEXT:    sltu a2, a0, a2
+; RV64ZICOND-NEXT:    sltu a2, a1, a0
 ; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
 ; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
 ; RV64ZICOND-NEXT:    or a0, a0, a1
@@ -3030,8 +3030,7 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64-LABEL: usubo.not.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    sltu a0, a1, a0
 ; RV64-NEXT:    xori a0, a0, 1
 ; RV64-NEXT:    ret
 ;
@@ -3053,8 +3052,7 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZBA-LABEL: usubo.not.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    sltu a0, a0, a1
+; RV64ZBA-NEXT:    sltu a0, a1, a0
 ; RV64ZBA-NEXT:    xori a0, a0, 1
 ; RV64ZBA-NEXT:    ret
 ;
@@ -3075,8 +3073,7 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZICOND-LABEL: usubo.not.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    sltu a0, a0, a1
+; RV64ZICOND-NEXT:    sltu a0, a1, a0
 ; RV64ZICOND-NEXT:    xori a0, a0, 1
 ; RV64ZICOND-NEXT:    ret
 entry:
@@ -4379,8 +4376,7 @@ continue:
 define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32-LABEL: usubo.br.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    sub a1, a0, a1
-; RV32-NEXT:    bgeu a0, a1, .LBB58_2
+; RV32-NEXT:    bgeu a1, a0, .LBB58_2
 ; RV32-NEXT:  # %bb.1: # %overflow
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    ret
@@ -4401,8 +4397,7 @@ define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZBA-LABEL: usubo.br.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    sub a1, a0, a1
-; RV32ZBA-NEXT:    bgeu a0, a1, .LBB58_2
+; RV32ZBA-NEXT:    bgeu a1, a0, .LBB58_2
 ; RV32ZBA-NEXT:  # %bb.1: # %overflow
 ; RV32ZBA-NEXT:    li a0, 0
 ; RV32ZBA-NEXT:    ret
@@ -4423,8 +4418,7 @@ define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZICOND-LABEL: usubo.br.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    sub a1, a0, a1
-; RV32ZICOND-NEXT:    bgeu a0, a1, .LBB58_2
+; RV32ZICOND-NEXT:    bgeu a1, a0, .LBB58_2
 ; RV32ZICOND-NEXT:  # %bb.1: # %overflow
 ; RV32ZICOND-NEXT:    li a0, 0
 ; RV32ZICOND-NEXT:    ret
@@ -4478,8 +4472,7 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64-LABEL: usubo.br.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    bgeu a0, a1, .LBB59_2
+; RV64-NEXT:    bgeu a1, a0, .LBB59_2
 ; RV64-NEXT:  # %bb.1: # %overflow
 ; RV64-NEXT:    li a0, 0
 ; RV64-NEXT:    ret
@@ -4509,8 +4502,7 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZBA-LABEL: usubo.br.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    sub a1, a0, a1
-; RV64ZBA-NEXT:    bgeu a0, a1, .LBB59_2
+; RV64ZBA-NEXT:    bgeu a1, a0, .LBB59_2
 ; RV64ZBA-NEXT:  # %bb.1: # %overflow
 ; RV64ZBA-NEXT:    li a0, 0
 ; RV64ZBA-NEXT:    ret
@@ -4540,8 +4532,7 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZICOND-LABEL: usubo.br.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    sub a1, a0, a1
-; RV64ZICOND-NEXT:    bgeu a0, a1, .LBB59_2
+; RV64ZICOND-NEXT:    bgeu a1, a0, .LBB59_2
 ; RV64ZICOND-NEXT:  # %bb.1: # %overflow
 ; RV64ZICOND-NEXT:    li a0, 0
 ; RV64ZICOND-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/xqcia.ll b/llvm/test/CodeGen/RISCV/xqcia.ll
index 3bbf33328f529..6d5fc765c49a8 100644
--- a/llvm/test/CodeGen/RISCV/xqcia.ll
+++ b/llvm/test/CodeGen/RISCV/xqcia.ll
@@ -71,10 +71,10 @@ define i32 @subsat(i32 %a, i32 %b) {
 define i32 @subusat(i32 %a, i32 %b) {
 ; RV32I-LABEL: subusat:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    sltu a0, a0, a1
+; RV32I-NEXT:    sub a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32IXQCIA-LABEL: subusat:
diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index fc73ce5503ffe..da2123a5dfe74 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -1068,12 +1068,12 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
 ; CHECK-LABEL: @extract_value_usub(
 ; CHECK-NEXT:    [[Z:%.*]] = add nuw i8 [[ZZ:%.*]], 1
 ; CHECK-NEXT:    [[Y:%.*]] = add i8 [[X:%.*]], [[Z]]
-; CHECK-NEXT:    [[SUB:%.*]] = xor i8 [[ZZ]], -1
-; CHECK-NEXT:    [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
+; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
+; CHECK-NEXT:    [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
 ; CHECK-NEXT:    call void @use.i1(i1 [[UOV]])
 ; CHECK-NEXT:    call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[ZZ]], -1
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %z = add nuw i8 %zz, 1
   %y = add i8 %x, %z
@@ -1090,11 +1090,12 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
 define i1 @extract_value_usub_fail(i8 %x, i8 %z) {
 ; CHECK-LABEL: @extract_value_usub_fail(
 ; CHECK-NEXT:    [[Y:%.*]] = add i8 [[X:%.*]], [[Z:%.*]]
-; CHECK-NEXT:    [[SUB:%.*]] = sub i8 0, [[Z]]
-; CHECK-NEXT:    [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
+; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
+; CHECK-NEXT:    [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
 ; CHECK-NEXT:    call void @use.i1(i1 [[UOV]])
 ; CHECK-NEXT:    call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[Z]], 0
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[SUB]], 0
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %y = add i8 %x, %z
diff --git a/llvm/test/Transforms/InstCombine/pr170634.ll b/llvm/test/Transforms/InstCombine/pr170634.ll
index 62a332e14b04a..3224b8b63afd3 100644
--- a/llvm/test/Transforms/InstCombine/pr170634.ll
+++ b/llvm/test/Transforms/InstCombine/pr170634.ll
@@ -3,12 +3,13 @@
 define dso_local i64 @func(i64 noundef %x, i64 noundef %y) local_unnamed_addr {
 ; CHECK-LABEL: @func(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
+; CHECK-NEXT:    [[TMP0:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    br label [[RETURN:%.*]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[TMP1:%.*]] = sub nuw i64 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       return:
 ; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i64 [ 291, [[IF_THEN]] ], [ [[TMP1]], [[IF_END]] ]
diff --git a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
index f8b318bc3680a..30a5072c7edc8 100644
--- a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
@@ -141,16 +141,16 @@ define i1 @t1_strict_logical(i8 %base, i8 %offset) {
 
 define i1 @t2(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t2(
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -168,16 +168,16 @@ define i1 @t2(i8 %base, i8 %offset) {
 
 define i1 @t2_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t2_logical(
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -321,16 +321,16 @@ define i1 @t5_commutability2_logical(i8 %base, i8 %offset) {
 
 define i1 @t6_commutability(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t6_commutability(
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -348,16 +348,16 @@ define i1 @t6_commutability(i8 %base, i8 %offset) {
 
 define i1 @t6_commutability_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t6_commutability_logical(
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -459,14 +459,14 @@ define i1 @t7_nonstrict_logical(i8 %base, i8 %offset) {
 
 define i1 @t8(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t8(
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -482,14 +482,14 @@ define i1 @t8(i8 %base, i8 %offset) {
 
 define i1 @t8_logical(i8 %base, i8 %offset) {
 ; CHECK-LABEL: @t8_logical(
-; CHECK-NEXT:    [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT:    [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT:    [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
 ; CHECK-NEXT:    call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT:    [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT:    [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
diff --git a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
index c9030e5ab0321..90ca39a70a0bb 100644
--- a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
+++ b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
@@ -175,10 +175,11 @@ define i32 @test7(i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[BB1:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -204,10 +205,11 @@ define i32 @test8(i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[COND_NOT:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    br i1 [[COND_NOT]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -294,10 +296,11 @@ define i32 @test10(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[AND:%.*]] = and i1 [[COND]], [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -325,10 +328,11 @@ define i32 @test10_logical(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[AND:%.*]] = select i1 [[COND]], i1 [[COND2:%.*]], i1 false
 ; CHECK-NEXT:    br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -356,10 +360,11 @@ define i32 @test11(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -387,10 +392,11 @@ define i32 @test11_logical(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -418,10 +424,11 @@ define i32 @test12(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
@@ -449,10 +456,11 @@ define i32 @test12_logical(i32 %a, i32 %b, i1 %cond2) {
 ; CHECK-NEXT:    [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT:    [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT:    [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
 ; CHECK-NEXT:    br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT:    [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
 ; CHECK-NEXT:    ret i32 [[R1]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret i32 0
diff --git a/llvm/test/Transforms/InstCombine/usubo.ll b/llvm/test/Transforms/InstCombine/usubo.ll
index e4b9c0e08ba22..2074190a2cd45 100644
--- a/llvm/test/Transforms/InstCombine/usubo.ll
+++ b/llvm/test/Transforms/InstCombine/usubo.ll
@@ -130,9 +130,10 @@ define i1 @sub_ne0(i8 %x, i8 %y, i1 %b) {
 
 define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
 ; CHECK-LABEL: @sub_eq1(
-; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
 ; CHECK-NEXT:    call void @use(i1 [[OV]])
+; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
 ; CHECK-NEXT:    [[EQ1:%.*]] = icmp eq i8 [[SUB]], 1
 ; CHECK-NEXT:    ret i1 [[EQ1]]
 ;
@@ -148,9 +149,10 @@ define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
 
 define i1 @sub_sgt0(i8 %x, i8 %y, i1 %b) {
 ; CHECK-LABEL: @sub_sgt0(
-; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
 ; CHECK-NEXT:    call void @use(i1 [[OV]])
+; CHECK-NEXT:    [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
 ; CHECK-NEXT:    [[SGT0:%.*]] = icmp sgt i8 [[SUB]], 0
 ; CHECK-NEXT:    ret i1 [[SGT0]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/with_overflow.ll b/llvm/test/Transforms/InstCombine/with_overflow.ll
index 0c82bdc256ddf..09ef32262ea78 100644
--- a/llvm/test/Transforms/InstCombine/with_overflow.ll
+++ b/llvm/test/Transforms/InstCombine/with_overflow.ll
@@ -506,10 +506,7 @@ define { i32, i1 } @ssub_no_canonicalize_constant_arg0(i32 %x) nounwind {
 
 define { i32, i1 } @usub_no_canonicalize_constant_arg0(i32 %x) nounwind {
 ; CHECK-LABEL: @usub_no_canonicalize_constant_arg0(
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 42, [[X:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 42, [[X]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i32, i1 } poison, i32 [[TMP1]], 0
-; CHECK-NEXT:    [[A:%.*]] = insertvalue { i32, i1 } [[TMP3]], i1 [[TMP2]], 1
+; CHECK-NEXT:    [[A:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 [[X:%.*]])
 ; CHECK-NEXT:    ret { i32, i1 } [[A]]
 ;
   %a = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 %x)

>From 42078e78872130250e5d2cb56d44cbf979277124 Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Sun, 7 Dec 2025 00:22:30 +0530
Subject: [PATCH 5/8] Update test cases

---
 .../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 166 ++--
 .../CodeGen/AMDGPU/a-v-global-atomicrmw.ll    |  24 +-
 llvm/test/CodeGen/AMDGPU/addsub64_carry.ll    |  85 +-
 .../test/CodeGen/AMDGPU/carryout-selection.ll | 889 ++++++++++++++++--
 llvm/test/CodeGen/AMDGPU/usubo.ll             | 196 ++--
 llvm/test/CodeGen/AMDGPU/usubsat.ll           |  54 +-
 llvm/test/CodeGen/ARM/addsubo-legalization.ll |  22 +-
 .../LoongArch/atomicrmw-cond-sub-clamp.ll     |  24 +-
 .../PowerPC/atomicrmw-cond-sub-clamp.ll       |  23 +-
 llvm/test/CodeGen/RISCV/pr170634.ll           |  22 +
 .../CodeGen/Thumb2/mve-saturating-arith.ll    |  40 +-
 .../VE/Scalar/atomicrmw-cond-sub-clamp.ll     |  20 +-
 .../WebAssembly/atomicrmw-cond-sub-clamp.ll   |  40 +-
 llvm/test/CodeGen/X86/combine-addo.ll         |  13 +-
 llvm/test/CodeGen/X86/combine-subo.ll         |   4 +-
 llvm/test/CodeGen/X86/vec_usubo.ll            | 741 ++++++++-------
 llvm/test/Transforms/InstCombine/pr170634.ll  |  34 -
 17 files changed, 1574 insertions(+), 823 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/pr170634.ll
 delete mode 100644 llvm/test/Transforms/InstCombine/pr170634.ll

diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index b8962fa29e8f1..67c053ce2ba1a 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -8742,30 +8742,31 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_a_a:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, 0x50, v0
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def a[0:1]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a1
-; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a0
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; GFX90A-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB113_4
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[4:5]
+; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:  .LBB113_2: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v6
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v4
+; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX90A-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
@@ -8776,26 +8777,27 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:  .LBB113_4: ; %Flow3
 ; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB113_6
 ; GFX90A-NEXT:  ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, -1, v4, vcc
-; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX90A-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(1)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v1
-; GFX90A-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v6
+; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, v0, v4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v2
-; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
+; GFX90A-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX90A-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB113_6: ; %atomicrmw.phi
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -8809,30 +8811,31 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_mov_b64 s[2:3], 0x50
 ; GFX950-NEXT:    s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
+; GFX950-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; def a[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
-; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
-; GFX950-NEXT:    v_accvgpr_read_b32 v7, a1
-; GFX950-NEXT:    v_accvgpr_read_b32 v6, a0
+; GFX950-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v7
+; GFX950-NEXT:    v_accvgpr_read_b32 v5, a1
+; GFX950-NEXT:    v_accvgpr_read_b32 v4, a0
 ; GFX950-NEXT:    ; implicit-def: $agpr0_agpr1
 ; GFX950-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX950-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX950-NEXT:    s_cbranch_execz .LBB113_4
 ; GFX950-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT:    flat_load_dwordx2 v[2:3], v[4:5]
+; GFX950-NEXT:    flat_load_dwordx2 v[2:3], v[6:7]
 ; GFX950-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX950-NEXT:  .LBB113_2: ; %atomicrmw.start
 ; GFX950-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v6
+; GFX950-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v4
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
+; GFX950-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX950-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX950-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX950-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
@@ -8843,26 +8846,26 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
 ; GFX950-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX950-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX950-NEXT:    v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX950-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-NEXT:  .LBB113_4: ; %Flow3
 ; GFX950-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
 ; GFX950-NEXT:    s_cbranch_execz .LBB113_6
 ; GFX950-NEXT:  ; %bb.5: ; %atomicrmw.private
-; GFX950-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX950-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
+; GFX950-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v6
+; GFX950-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v4
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v7, vcc
+; GFX950-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
 ; GFX950-NEXT:    v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_accvgpr_write_b32 a1, v1
 ; GFX950-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX950-NEXT:    v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT:    scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT:    scratch_store_dwordx2 v6, v[2:3], off
 ; GFX950-NEXT:  .LBB113_6: ; %atomicrmw.phi
 ; GFX950-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX950-NEXT:    ;;#ASMSTART
@@ -8881,28 +8884,29 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_av_av:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, 0x50, v0
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
 ; GFX90A-NEXT:    ;;#ASMSTART
-; GFX90A-NEXT:    ; def v[6:7]
+; GFX90A-NEXT:    ; def v[4:5]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB114_4
 ; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[4:5]
+; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:  .LBB114_2: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v6
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v4
+; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX90A-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
@@ -8911,22 +8915,23 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB114_2
 ; GFX90A-NEXT:  ; %bb.3: ; %Flow
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:  .LBB114_4: ; %Flow3
 ; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB114_6
 ; GFX90A-NEXT:  ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc
 ; GFX90A-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
 ; GFX90A-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(1)
-; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, v0, v6
+; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, v0, v4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_subb_co_u32_e32 v4, vcc, v1, v7, vcc
+; GFX90A-NEXT:    v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
+; GFX90A-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
 ; GFX90A-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
 ; GFX90A-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB114_6: ; %atomicrmw.phi
@@ -8960,9 +8965,10 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v4
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX950-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -8986,6 +8992,7 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
 ; GFX950-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v4
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
@@ -17059,8 +17066,9 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v4
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -17079,19 +17087,20 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:  ; %bb.5: ; %atomicrmw.private
 ; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX90A-NEXT:    s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
-; GFX90A-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(1)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v1
-; GFX90A-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v4
+; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, v0, v4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v2
-; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
+; GFX90A-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX90A-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB221_6: ; %atomicrmw.phi
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use a[0:1]
@@ -17124,9 +17133,10 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v4
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX950-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -17150,11 +17160,11 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX950-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v4
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
 ; GFX950-NEXT:    v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_accvgpr_write_b32 a1, v1
 ; GFX950-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX950-NEXT:    v_accvgpr_write_b32 a1, v1
 ; GFX950-NEXT:    scratch_store_dwordx2 off, v[2:3], s0
 ; GFX950-NEXT:  .LBB221_6: ; %atomicrmw.phi
 ; GFX950-NEXT:    ;;#ASMSTART
@@ -17192,8 +17202,9 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v4
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -17216,9 +17227,10 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    s_waitcnt vmcnt(1)
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, v0, v4
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_subb_co_u32_e32 v4, vcc, v1, v5, vcc
+; GFX90A-NEXT:    v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
+; GFX90A-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v6, 0, vcc
 ; GFX90A-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
 ; GFX90A-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB222_6: ; %atomicrmw.phi
@@ -17251,9 +17263,10 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v4
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX950-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -17275,6 +17288,7 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
 ; GFX950-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v4
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
index b6fe0c756a106..53270d0c36ae1 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
@@ -5810,8 +5810,9 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v2, vcc, v4, v6
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX90A-NEXT:    v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
@@ -5844,9 +5845,10 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
 ; GFX950-NEXT:    v_sub_co_u32_e32 v2, vcc, v4, v6
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
+; GFX950-NEXT:    v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX950-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
@@ -5883,8 +5885,9 @@ define void @global_atomic_usub_sat_i64_ret_av_av(ptr addrspace(1) %ptr) #0 {
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v2, vcc, v4, v6
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX90A-NEXT:    v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
@@ -5913,9 +5916,10 @@ define void @global_atomic_usub_sat_i64_ret_av_av(ptr addrspace(1) %ptr) #0 {
 ; GFX950-NEXT:    v_sub_co_u32_e32 v2, vcc, v4, v6
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
+; GFX950-NEXT:    v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX950-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
@@ -11575,8 +11579,9 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v4
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -11610,9 +11615,10 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
 ; GFX950-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v4
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX950-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -11650,8 +11656,9 @@ define void @global_atomic_usub_sat_i64_saddr_ret_av_av(ptr addrspace(1) inreg %
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v4
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -11681,9 +11688,10 @@ define void @global_atomic_usub_sat_i64_saddr_ret_av_av(ptr addrspace(1) inreg %
 ; GFX950-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v4
 ; GFX950-NEXT:    s_nop 1
 ; GFX950-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX950-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX950-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX950-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
index 8088c1b4c8fc7..46f1662b417bb 100644
--- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
+++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
@@ -57,15 +57,17 @@ define <2 x i64> @v_usub_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
 ; CHECK-LABEL: v_usub_v2i64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_sub_co_u32_e32 v6, vcc, v2, v6
-; CHECK-NEXT:    v_sub_co_u32_e64 v4, s[4:5], v0, v4
-; CHECK-NEXT:    v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
-; CHECK-NEXT:    v_subb_co_u32_e64 v5, s[4:5], v1, v5, s[4:5]
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; CHECK-NEXT:    v_sub_co_u32_e32 v12, vcc, v2, v6
+; CHECK-NEXT:    v_subb_co_u32_e32 v13, vcc, v3, v7, vcc
+; CHECK-NEXT:    v_sub_co_u32_e32 v10, vcc, v0, v4
+; CHECK-NEXT:    v_subb_co_u32_e32 v11, vcc, v1, v5, vcc
+; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
+; CHECK-NEXT:    flat_store_dwordx4 v[8:9], v[10:13]
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[6:7]
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v2
-; CHECK-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %pair = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
@@ -138,9 +140,10 @@ define i64 @v_usub_p1(i64 %val0, i64 %val1, ptr %ptrval) {
 ; CHECK-LABEL: v_usub_p1:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_subrev_co_u32_e32 v0, vcc, 1, v0
-; CHECK-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
+; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, -1, v0
+; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc, 1, v[0:1]
+; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[2:3]
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -157,11 +160,11 @@ define i64 @v_usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
 ; CHECK-LABEL: v_usub_n1:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_subrev_co_u32_e32 v0, vcc, -1, v0
-; CHECK-NEXT:    v_subbrev_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; CHECK-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 -1)
@@ -225,20 +228,24 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
 define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
 ; CHECK-LABEL: s_usub_v2i64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_sub_u32 s6, s2, s6
-; CHECK-NEXT:    s_subb_u32 s7, s3, s7
-; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT:    s_sub_u32 s0, s0, s4
-; CHECK-NEXT:    s_subb_u32 s1, s1, s5
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v7
-; CHECK-NEXT:    v_readfirstlane_b32 s2, v6
-; CHECK-NEXT:    v_mov_b32_e32 v4, s6
-; CHECK-NEXT:    v_mov_b32_e32 v5, s7
+; CHECK-NEXT:    v_mov_b32_e32 v9, s7
+; CHECK-NEXT:    v_mov_b32_e32 v8, s6
+; CHECK-NEXT:    v_mov_b32_e32 v7, s5
+; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[8:9]
+; CHECK-NEXT:    v_mov_b32_e32 v6, s4
+; CHECK-NEXT:    s_sub_u32 s8, s2, s6
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[6:7]
+; CHECK-NEXT:    s_subb_u32 s9, s3, s7
+; CHECK-NEXT:    s_sub_u32 s10, s0, s4
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; CHECK-NEXT:    s_subb_u32 s11, s1, s5
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v6
+; CHECK-NEXT:    v_readfirstlane_b32 s2, v8
+; CHECK-NEXT:    v_mov_b32_e32 v2, s10
+; CHECK-NEXT:    v_mov_b32_e32 v3, s11
+; CHECK-NEXT:    v_mov_b32_e32 v4, s8
+; CHECK-NEXT:    v_mov_b32_e32 v5, s9
 ; CHECK-NEXT:    s_mov_b32 s1, s0
 ; CHECK-NEXT:    s_mov_b32 s3, s2
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
@@ -322,11 +329,11 @@ define amdgpu_ps i64 @s_uadd_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
 define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
 ; CHECK-LABEL: s_usub_p1:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_sub_u32 s0, s0, 1
-; CHECK-NEXT:    s_subb_u32 s1, s1, 0
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    s_add_u32 s2, s0, -1
+; CHECK-NEXT:    s_addc_u32 s3, s1, -1
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[0:1], 1
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
 ; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -344,15 +351,13 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
 define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
 ; CHECK-LABEL: s_usub_n1:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_sub_u32 s0, s0, -1
-; CHECK-NEXT:    s_subb_u32 s1, s1, -1
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    s_add_u32 s0, s0, 1
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s1, 0
 ; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
-; CHECK-NEXT:    s_mov_b32 s1, s0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    ; return to shader part epilog
   %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 -1)
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 163d7ff9c61fc..19b801a840ae0 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -128,6 +128,31 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: sadd64rr
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_ADD_U:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[S_ADD_U]]
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[COPY9]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %add = add i64 %a, %b
   store i64 %add, ptr addrspace(1) %out
@@ -238,6 +263,30 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: sadd64ri
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
+; GCN-ISEL-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_ADD_U:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[S_ADD_U]]
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %add = add i64 20015998343286, %a
   store i64 %add, ptr addrspace(1) %out
@@ -340,6 +389,29 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
 ; GFX1250-NEXT:    v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
 ; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: vadd64rr
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $vcc, implicit $exec
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[V_ADD_U]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -442,6 +514,26 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: vadd64ri
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
+; GCN-ISEL-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE2]], implicit-def dead $vcc, implicit $exec
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[V_ADD_U]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -550,6 +642,24 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: suaddo32
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
+; GCN-ISEL-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY3]], killed [[COPY4]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[COPY5]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
@@ -686,6 +796,35 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX1250-NEXT:    global_store_b8 v0, v2, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: uaddo32_vcc_user
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
+; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]]
+; GCN-ISEL-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 killed [[COPY9]], [[COPY11]], 0, implicit $exec
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_ADD_CO_U32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
+; GCN-ISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_ADD_CO_U32_e64_1]], implicit $exec
+; GCN-ISEL-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
@@ -832,6 +971,38 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[8:9]
 ; GFX1250-NEXT:    global_store_b8 v2, v3, s[10:11]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: suaddo64
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s256) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
+; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
+; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
+; GCN-ISEL-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
+; GCN-ISEL-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
+; GCN-ISEL-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
+; GCN-ISEL-NEXT:   [[S_UADDO:%[0-9]+]]:sreg_32, [[S_UADDO1:%[0-9]+]]:sreg_64_xexec = S_UADDO_PSEUDO killed [[COPY10]], killed [[COPY12]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[S_ADD_C:%[0-9]+]]:sreg_32, [[S_ADD_C1:%[0-9]+]]:sreg_64_xexec = S_ADD_CO_PSEUDO killed [[COPY9]], killed [[COPY11]], killed [[S_UADDO1]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_UADDO]], %subreg.sub0, killed [[S_ADD_C]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE4]]
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[COPY13]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+; GCN-ISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_ADD_C1]], implicit $exec
+; GCN-ISEL-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
@@ -978,6 +1149,40 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1250-NEXT:    global_store_b8 v2, v3, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: vuaddo64
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
+; GCN-ISEL-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 killed [[COPY3]], [[COPY1]](s32), 0, implicit $exec
+; GCN-ISEL-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+; GCN-ISEL-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]]
+; GCN-ISEL-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_2]]
+; GCN-ISEL-NEXT:   [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY12]], [[COPY13]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+; GCN-ISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_ADDC_U32_e64_1]], implicit $exec
+; GCN-ISEL-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %tid.ext)
@@ -1105,6 +1310,31 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: ssub64rr
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[S_SUB_U]]
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[COPY9]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %sub = sub i64 %a, %b
   store i64 %sub, ptr addrspace(1) %out
@@ -1215,6 +1445,30 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: ssub64ri
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
+; GCN-ISEL-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[S_SUB_U]]
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %sub = sub i64 20015998343286, %a
   store i64 %sub, ptr addrspace(1) %out
@@ -1317,6 +1571,29 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
 ; GFX1250-NEXT:    v_sub_nc_u64_e32 v[2:3], s[2:3], v[0:1]
 ; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: vsub64rr
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $vcc, implicit $exec
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[V_SUB_U]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -1419,6 +1696,26 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: vsub64ri
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
+; GCN-ISEL-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], implicit-def dead $vcc, implicit $exec
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[V_SUB_U]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -1528,6 +1825,24 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: susubo32
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
+; GCN-ISEL-NEXT:   [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 killed [[COPY3]], killed [[COPY4]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_SUB_I32_]]
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[COPY5]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
@@ -1664,6 +1979,35 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX1250-NEXT:    global_store_b8 v0, v2, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: usubo32_vcc_user
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
+; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]]
+; GCN-ISEL-NEXT:   [[V_SUB_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_SUB_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_SUB_CO_U32_e64 killed [[COPY9]], [[COPY11]], 0, implicit $exec
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_SUB_CO_U32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
+; GCN-ISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_SUB_CO_U32_e64_1]], implicit $exec
+; GCN-ISEL-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
@@ -1685,20 +2029,21 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; CISI-NEXT:    s_mov_b32 s11, 0xf000
 ; CISI-NEXT:    s_mov_b32 s10, -1
 ; CISI-NEXT:    s_waitcnt lgkmcnt(0)
-; CISI-NEXT:    s_sub_u32 s4, s4, s6
-; CISI-NEXT:    s_subb_u32 s5, s5, s7
+; CISI-NEXT:    v_mov_b32_e32 v0, s6
+; CISI-NEXT:    v_mov_b32_e32 v1, s7
+; CISI-NEXT:    s_sub_u32 s6, s4, s6
+; CISI-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; CISI-NEXT:    s_subb_u32 s7, s5, s7
+; CISI-NEXT:    v_mov_b32_e32 v2, s6
 ; CISI-NEXT:    s_mov_b32 s8, s0
 ; CISI-NEXT:    s_mov_b32 s9, s1
-; CISI-NEXT:    v_mov_b32_e32 v0, s4
-; CISI-NEXT:    v_mov_b32_e32 v1, s5
-; CISI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; CISI-NEXT:    s_mov_b32 s0, s2
 ; CISI-NEXT:    s_mov_b32 s1, s3
 ; CISI-NEXT:    s_mov_b32 s2, s10
 ; CISI-NEXT:    s_mov_b32 s3, s11
-; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
-; CISI-NEXT:    s_waitcnt expcnt(0)
-; CISI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CISI-NEXT:    v_mov_b32_e32 v3, s7
+; CISI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CISI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
 ; CISI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; CISI-NEXT:    s_endpgm
 ;
@@ -1707,16 +2052,18 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    s_sub_u32 s0, s4, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v5, s7
 ; VI-NEXT:    s_subb_u32 s1, s5, s7
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; VI-NEXT:    v_mov_b32_e32 v6, s0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -1725,12 +2072,14 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s14
+; GFX9-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX9-NEXT:    s_sub_u32 s0, s12, s14
 ; GFX9-NEXT:    s_subb_u32 s1, s13, s15
+; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX9-NEXT:    global_store_byte v2, v3, s[10:11]
 ; GFX9-NEXT:    s_endpgm
@@ -1743,8 +2092,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1010-NEXT:    s_sub_u32 s0, s12, s14
 ; GFX1010-NEXT:    s_subb_u32 s1, s13, s15
 ; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
-; GFX1010-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1010-NEXT:    v_cmp_gt_u64_e64 s0, s[12:13], s[14:15]
 ; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
 ; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX1010-NEXT:    global_store_byte v2, v3, s[10:11]
@@ -1755,11 +2104,11 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W32-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT:    s_sub_u32 s4, s4, s6
-; GFX1030W32-NEXT:    s_subb_u32 s5, s5, s7
-; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030W32-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030W32-NEXT:    s_sub_u32 s8, s4, s6
+; GFX1030W32-NEXT:    v_cmp_gt_u64_e64 s4, s[4:5], s[6:7]
+; GFX1030W32-NEXT:    s_subb_u32 s9, s5, s7
+; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX1030W32-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
 ; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1030W32-NEXT:    global_store_byte v2, v3, s[2:3]
@@ -1770,11 +2119,11 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W64-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT:    s_sub_u32 s4, s4, s6
-; GFX1030W64-NEXT:    s_subb_u32 s5, s5, s7
-; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030W64-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX1030W64-NEXT:    s_sub_u32 s8, s4, s6
+; GFX1030W64-NEXT:    s_subb_u32 s9, s5, s7
+; GFX1030W64-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[4:5], s[6:7]
+; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX1030W64-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1030W64-NEXT:    global_store_byte v2, v3, s[2:3]
@@ -1784,11 +2133,11 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sub_u32 s4, s4, s6
-; GFX11-NEXT:    s_subb_u32 s5, s5, s7
-; GFX11-NEXT:    v_mov_b32_e32 v0, s4
-; GFX11-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    s_sub_u32 s8, s4, s6
+; GFX11-NEXT:    v_cmp_gt_u64_e64 s4, s[4:5], s[6:7]
+; GFX11-NEXT:    s_subb_u32 s9, s5, s7
+; GFX11-NEXT:    v_mov_b32_e32 v0, s8
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -1799,17 +2148,51 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_sub_co_u32 s0, s12, s14
-; GFX1250-NEXT:    s_sub_co_ci_u32 s1, s13, s15
-; GFX1250-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
-; GFX1250-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    s_sub_nc_u64 s[0:1], s[12:13], s[14:15]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-NEXT:    v_cmp_gt_u64_e64 s0, s[12:13], s[14:15]
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[8:9]
 ; GFX1250-NEXT:    global_store_b8 v2, v3, s[10:11]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: susubo64
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s256) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
+; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
+; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
+; GCN-ISEL-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
+; GCN-ISEL-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE5]]
+; GCN-ISEL-NEXT:   [[V_CMP_GT_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U64_e64 [[REG_SEQUENCE4]], [[COPY13]], implicit $exec
+; GCN-ISEL-NEXT:   [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO [[REG_SEQUENCE4]], [[REG_SEQUENCE5]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY14:%[0-9]+]]:vreg_64 = COPY [[S_SUB_U]]
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[COPY14]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+; GCN-ISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_CMP_GT_U64_e64_]], implicit $exec
+; GCN-ISEL-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
@@ -1829,21 +2212,22 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; CISI:       ; %bb.0:
 ; CISI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CISI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; CISI-NEXT:    v_mov_b32_e32 v1, 0
 ; CISI-NEXT:    s_mov_b32 s7, 0xf000
 ; CISI-NEXT:    s_mov_b32 s6, -1
 ; CISI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CISI-NEXT:    s_mov_b32 s4, s0
-; CISI-NEXT:    v_mov_b32_e32 v1, s9
-; CISI-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
+; CISI-NEXT:    v_mov_b32_e32 v3, s9
+; CISI-NEXT:    v_sub_i32_e32 v2, vcc, s8, v0
+; CISI-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; CISI-NEXT:    v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
 ; CISI-NEXT:    s_mov_b32 s5, s1
-; CISI-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CISI-NEXT:    s_mov_b32 s0, s2
 ; CISI-NEXT:    s_mov_b32 s1, s3
 ; CISI-NEXT:    s_mov_b32 s2, s6
 ; CISI-NEXT:    s_mov_b32 s3, s7
-; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; CISI-NEXT:    s_waitcnt expcnt(0)
 ; CISI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CISI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
 ; CISI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; CISI-NEXT:    s_endpgm
 ;
@@ -1851,31 +2235,34 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    v_mov_b32_e32 v6, s5
-; VI-NEXT:    v_sub_u32_e32 v5, vcc, s4, v0
-; VI-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; VI-NEXT:    v_mov_b32_e32 v2, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s2
-; VI-NEXT:    v_mov_b32_e32 v4, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v7, s5
+; VI-NEXT:    v_sub_u32_e32 v6, vcc, s4, v0
+; VI-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s2
+; VI-NEXT:    v_mov_b32_e32 v5, s3
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-NEXT:    flat_store_dwordx2 v[1:2], v[5:6]
-; VI-NEXT:    flat_store_byte v[3:4], v0
+; VI-NEXT:    flat_store_dwordx2 v[2:3], v[6:7]
+; VI-NEXT:    flat_store_byte v[4:5], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: vusubo64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s6, v0
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
+; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: vusubo64:
@@ -1883,13 +2270,14 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1010-NEXT:    s_clause 0x1
 ; GFX1010-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    v_sub_co_u32 v0, s4, s6, v0
-; GFX1010-NEXT:    v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
-; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1010-NEXT:    global_store_byte v2, v3, s[2:3]
+; GFX1010-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
+; GFX1010-NEXT:    v_sub_co_u32 v2, s4, s6, v0
+; GFX1010-NEXT:    v_sub_co_ci_u32_e64 v3, s4, s7, 0, s4
+; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1010-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX1010-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: vusubo64:
@@ -1897,13 +2285,14 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W32-NEXT:    s_clause 0x1
 ; GFX1030W32-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1030W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT:    v_sub_co_u32 v0, s4, s6, v0
-; GFX1030W32-NEXT:    v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
-; GFX1030W32-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W32-NEXT:    global_store_byte v2, v3, s[2:3]
+; GFX1030W32-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
+; GFX1030W32-NEXT:    v_sub_co_u32 v2, s4, s6, v0
+; GFX1030W32-NEXT:    v_sub_co_ci_u32_e64 v3, null, s7, 0, s4
+; GFX1030W32-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1030W32-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX1030W32-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX1030W32-NEXT:    s_endpgm
 ;
 ; GFX1030W64-LABEL: vusubo64:
@@ -1911,13 +2300,14 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W64-NEXT:    s_clause 0x1
 ; GFX1030W64-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1030W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1030W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT:    v_sub_co_u32 v0, s[4:5], s6, v0
-; GFX1030W64-NEXT:    v_sub_co_ci_u32_e64 v1, s[4:5], s7, 0, s[4:5]
-; GFX1030W64-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
-; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W64-NEXT:    global_store_byte v2, v3, s[2:3]
+; GFX1030W64-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
+; GFX1030W64-NEXT:    v_sub_co_u32 v2, s[4:5], s6, v0
+; GFX1030W64-NEXT:    v_sub_co_ci_u32_e64 v3, null, s7, 0, s[4:5]
+; GFX1030W64-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX1030W64-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX1030W64-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX1030W64-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: vusubo64:
@@ -1925,17 +2315,16 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_co_u32 v0, s4, s6, v0
-; GFX11-NEXT:    v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
+; GFX11-NEXT:    v_sub_co_u32 v2, s4, s6, v0
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v3, null, s7, 0, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
+; GFX11-NEXT:    global_store_b8 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: vusubo64:
@@ -1944,18 +2333,50 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_sub_co_u32 v0, s4, s6, v0
-; GFX1250-NEXT:    v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1250-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
+; GFX1250-NEXT:    v_sub_nc_u64_e32 v[2:3], s[6:7], v[0:1]
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX1250-NEXT:    global_store_b8 v2, v3, s[2:3]
+; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT:    global_store_b8 v1, v0, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
+; GCN-ISEL-LABEL: name: vusubo64
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
+; GCN-ISEL-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[V_CMP_GT_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE5]], implicit $exec
+; GCN-ISEL-NEXT:   [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO [[REG_SEQUENCE4]], [[REG_SEQUENCE5]], implicit-def dead $vcc, implicit $exec
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[V_SUB_U]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+; GCN-ISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_CMP_GT_U64_e64_]], implicit $exec
+; GCN-ISEL-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %tid.ext)
@@ -3205,6 +3626,292 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1250-NEXT:  .LBB16_4:
 ; GFX1250-NEXT:    ; implicit-def: $sgpr8_sgpr9
 ; GFX1250-NEXT:    s_branch .LBB16_2
+; GCN-ISEL-LABEL: name: sudiv64
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT:   successors: %bb.3(0x50000000), %bb.1(0x30000000)
+; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_192 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3, [[COPY2]], %subreg.sub4, [[COPY1]], %subreg.sub5
+; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sgpr_192 = COPY [[REG_SEQUENCE]]
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]]
+; GCN-ISEL-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
+; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_]], %subreg.sub0, killed [[COPY10]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+; GCN-ISEL-NEXT:   [[COPY11:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
+; GCN-ISEL-NEXT:   [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 killed [[REG_SEQUENCE3]], [[COPY11]], implicit $exec
+; GCN-ISEL-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+; GCN-ISEL-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+; GCN-ISEL-NEXT:   [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, killed [[V_CMP_NE_U64_e64_]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   $vcc = COPY [[S_AND_B64_]]
+; GCN-ISEL-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+; GCN-ISEL-NEXT:   S_BRANCH %bb.1
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT: bb.1.Flow:
+; GCN-ISEL-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF]], %bb.0, %6, %bb.3
+; GCN-ISEL-NEXT:   [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_1]], %bb.0, %40, %bb.3
+; GCN-ISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI1]], implicit $exec
+; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+; GCN-ISEL-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_]]
+; GCN-ISEL-NEXT:   S_CMP_LG_U32 killed [[COPY12]], killed [[S_MOV_B32_1]], implicit-def $scc
+; GCN-ISEL-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit $scc
+; GCN-ISEL-NEXT:   S_BRANCH %bb.2
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT: bb.2 (%ir-block.7):
+; GCN-ISEL-NEXT:   successors: %bb.4(0x80000000)
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY9]].sub0
+; GCN-ISEL-NEXT:   [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+; GCN-ISEL-NEXT:   [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 killed [[S_MOV_B32_2]], [[COPY13]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY13]], implicit $mode, implicit $exec
+; GCN-ISEL-NEXT:   [[V_RCP_IFLAG_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e32 killed [[V_CVT_F32_U32_e32_]], implicit $mode, implicit $exec
+; GCN-ISEL-NEXT:   [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1333788670, killed [[V_RCP_IFLAG_F32_e32_]], implicit $mode, implicit $exec
+; GCN-ISEL-NEXT:   [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 killed [[V_MUL_F32_e32_]], implicit $mode, implicit $exec
+; GCN-ISEL-NEXT:   [[COPY15:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e32_]]
+; GCN-ISEL-NEXT:   [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_SUB_I32_]], [[COPY15]]
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e32_]], killed [[S_MUL_I32_]], implicit $exec
+; GCN-ISEL-NEXT:   [[COPY16:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e32_]]
+; GCN-ISEL-NEXT:   [[COPY17:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_]]
+; GCN-ISEL-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY16]], killed [[COPY17]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY killed [[S_ADD_I32_]]
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY14]], [[COPY18]], implicit $exec
+; GCN-ISEL-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+; GCN-ISEL-NEXT:   [[COPY19:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_1]]
+; GCN-ISEL-NEXT:   [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY19]], [[S_MOV_B32_3]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY20:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_1]]
+; GCN-ISEL-NEXT:   [[S_MUL_I32_1:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY20]], [[COPY13]]
+; GCN-ISEL-NEXT:   [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY14]], killed [[S_MUL_I32_1]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_SUB_I32_1]], [[COPY13]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   S_CMP_GE_U32 [[S_SUB_I32_1]], [[COPY13]], implicit-def $scc
+; GCN-ISEL-NEXT:   [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_SUB_I32_2]], [[S_SUB_I32_1]], implicit $scc
+; GCN-ISEL-NEXT:   [[COPY21:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_1]]
+; GCN-ISEL-NEXT:   [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_ADD_I32_1]], [[COPY21]], implicit $scc
+; GCN-ISEL-NEXT:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_CSELECT_B32_1]], [[S_MOV_B32_3]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   S_CMP_GE_U32 killed [[S_CSELECT_B32_]], [[COPY13]], implicit-def $scc
+; GCN-ISEL-NEXT:   [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_ADD_I32_2]], [[S_CSELECT_B32_1]], implicit $scc
+; GCN-ISEL-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_CSELECT_B32_2]], %subreg.sub0, killed [[S_MOV_B32_4]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY22:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE4]]
+; GCN-ISEL-NEXT:   S_BRANCH %bb.4
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT: bb.3 (%ir-block.12):
+; GCN-ISEL-NEXT:   successors: %bb.1(0x80000000)
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT:   [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY9]].sub0
+; GCN-ISEL-NEXT:   [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY23]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT:   [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY9]].sub1
+; GCN-ISEL-NEXT:   [[V_CVT_F32_U32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY24]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT:   [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 1333788672
+; GCN-ISEL-NEXT:   [[V_FMA_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed [[V_CVT_F32_U32_e64_1]], 0, killed [[S_MOV_B32_5]], 0, killed [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT:   [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, killed [[V_FMA_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT:   [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 1602224124
+; GCN-ISEL-NEXT:   [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_RCP_F32_e64_]], 0, killed [[S_MOV_B32_6]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT:   [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 796917760
+; GCN-ISEL-NEXT:   [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_MUL_F32_e64_]], 0, killed [[S_MOV_B32_7]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT:   [[V_TRUNC_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_TRUNC_F32_e64 0, killed [[V_MUL_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT:   [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 -813694976
+; GCN-ISEL-NEXT:   [[V_FMA_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[V_TRUNC_F32_e64_]], 0, killed [[S_MOV_B32_8]], 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT:   [[V_CVT_U32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed [[V_FMA_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT:   [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+; GCN-ISEL-NEXT:   [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO killed [[S_MOV_B64_2]], [[COPY9]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY25:%[0-9]+]]:sreg_32 = COPY [[S_SUB_U]].sub1
+; GCN-ISEL-NEXT:   [[COPY26:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
+; GCN-ISEL-NEXT:   [[S_MUL_I32_2:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY25]], [[COPY26]]
+; GCN-ISEL-NEXT:   [[COPY27:%[0-9]+]]:sreg_32 = COPY [[S_SUB_U]].sub0
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY27]], [[V_CVT_U32_F32_e64_]], implicit $exec
+; GCN-ISEL-NEXT:   [[V_CVT_U32_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[V_TRUNC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT:   [[COPY28:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
+; GCN-ISEL-NEXT:   [[S_MUL_I32_3:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[COPY28]]
+; GCN-ISEL-NEXT:   [[COPY29:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_2]]
+; GCN-ISEL-NEXT:   [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY29]], killed [[S_MUL_I32_3]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_3]], killed [[S_MUL_I32_2]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_]], [[S_ADD_I32_4]], implicit $exec
+; GCN-ISEL-NEXT:   [[COPY30:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
+; GCN-ISEL-NEXT:   [[S_MUL_I32_4:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY30]], [[S_ADD_I32_4]]
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_4]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_3]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY31:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
+; GCN-ISEL-NEXT:   [[S_MUL_I32_5:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[COPY31]]
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_4:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_]], [[S_MUL_I32_5]], implicit $exec
+; GCN-ISEL-NEXT:   [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE6:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_MUL_HI_U32_e64_4]], %subreg.sub0, [[S_MOV_B32_9]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_ADD_U:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE6]], killed [[REG_SEQUENCE5]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY32:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U]].sub0
+; GCN-ISEL-NEXT:   [[COPY33:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U]].sub1
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_5:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_1]], [[S_ADD_I32_4]], implicit $exec
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_6:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_1]], [[S_MUL_I32_5]], implicit $exec
+; GCN-ISEL-NEXT:   [[COPY34:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
+; GCN-ISEL-NEXT:   [[S_MUL_I32_6:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY34]], [[S_MUL_I32_5]]
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE7:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_6]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_6]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY35:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE7]].sub0
+; GCN-ISEL-NEXT:   [[COPY36:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE7]].sub1
+; GCN-ISEL-NEXT:   [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+; GCN-ISEL-NEXT:   [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 killed [[COPY32]], killed [[COPY35]], implicit-def $scc
+; GCN-ISEL-NEXT:   [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY33]], killed [[COPY36]], implicit-def $scc, implicit $scc
+; GCN-ISEL-NEXT:   [[COPY37:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_5]]
+; GCN-ISEL-NEXT:   [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY37]], [[S_MOV_B32_10]], implicit-def dead $scc, implicit $scc
+; GCN-ISEL-NEXT:   [[COPY38:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
+; GCN-ISEL-NEXT:   [[S_MUL_I32_7:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY38]], [[S_ADD_I32_4]]
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE8:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_7]], %subreg.sub0, killed [[S_ADDC_U32_1]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE9:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_]], %subreg.sub0, killed [[S_ADDC_U32_]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY39:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE9]].sub1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE10:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY39]], %subreg.sub0, [[S_MOV_B32_10]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_ADD_U1:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE10]], killed [[REG_SEQUENCE8]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY40:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U1]].sub0
+; GCN-ISEL-NEXT:   [[COPY41:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
+; GCN-ISEL-NEXT:   [[S_UADDO:%[0-9]+]]:sreg_32, [[S_UADDO1:%[0-9]+]]:sreg_64_xexec = S_UADDO_PSEUDO [[COPY41]], killed [[COPY40]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY42:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U1]].sub1
+; GCN-ISEL-NEXT:   [[COPY43:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
+; GCN-ISEL-NEXT:   [[S_ADD_C:%[0-9]+]]:sreg_32, [[S_ADD_C1:%[0-9]+]]:sreg_64_xexec = S_ADD_CO_PSEUDO [[COPY43]], killed [[COPY42]], killed [[S_UADDO1]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[S_MUL_I32_8:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[S_ADD_C]]
+; GCN-ISEL-NEXT:   [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_UADDO]]
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_7:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY27]], [[COPY44]], implicit $exec
+; GCN-ISEL-NEXT:   [[COPY45:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_7]]
+; GCN-ISEL-NEXT:   [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY45]], killed [[S_MUL_I32_8]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[S_MUL_I32_9:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY25]], [[S_UADDO]]
+; GCN-ISEL-NEXT:   [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_5]], killed [[S_MUL_I32_9]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_6]]
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_8:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_ADD_C]], [[COPY46]], implicit $exec
+; GCN-ISEL-NEXT:   [[S_MUL_I32_10:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[S_UADDO]]
+; GCN-ISEL-NEXT:   [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_10]]
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_9:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_ADD_C]], [[COPY47]], implicit $exec
+; GCN-ISEL-NEXT:   [[S_MUL_I32_11:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_ADD_C]], [[S_MUL_I32_10]]
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE11:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_11]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_9]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY48:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE11]].sub0
+; GCN-ISEL-NEXT:   [[COPY49:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE11]].sub1
+; GCN-ISEL-NEXT:   [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_6]]
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_10:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_UADDO]], [[COPY50]], implicit $exec
+; GCN-ISEL-NEXT:   [[S_MUL_I32_12:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_UADDO]], [[S_ADD_I32_6]]
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE12:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_12]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_10]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_10]]
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_11:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_UADDO]], [[COPY51]], implicit $exec
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE13:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_MUL_HI_U32_e64_11]], %subreg.sub0, [[S_MOV_B32_9]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_ADD_U2:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE13]], killed [[REG_SEQUENCE12]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY52:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U2]].sub0
+; GCN-ISEL-NEXT:   [[COPY53:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U2]].sub1
+; GCN-ISEL-NEXT:   [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 killed [[COPY52]], killed [[COPY48]], implicit-def $scc
+; GCN-ISEL-NEXT:   [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY53]], killed [[COPY49]], implicit-def $scc, implicit $scc
+; GCN-ISEL-NEXT:   [[COPY54:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_8]]
+; GCN-ISEL-NEXT:   [[S_ADDC_U32_3:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY54]], [[S_MOV_B32_10]], implicit-def dead $scc, implicit $scc
+; GCN-ISEL-NEXT:   [[S_MUL_I32_13:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_ADD_C]], [[S_ADD_I32_6]]
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE14:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_13]], %subreg.sub0, killed [[S_ADDC_U32_3]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE15:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_1]], %subreg.sub0, killed [[S_ADDC_U32_2]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY55:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE15]].sub1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE16:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY55]], %subreg.sub0, [[S_MOV_B32_10]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_ADD_U3:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE16]], killed [[REG_SEQUENCE14]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY56:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U3]].sub0
+; GCN-ISEL-NEXT:   [[S_UADDO2:%[0-9]+]]:sreg_32, [[S_UADDO3:%[0-9]+]]:sreg_64_xexec = S_UADDO_PSEUDO [[S_UADDO]], killed [[COPY56]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY57:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U3]].sub1
+; GCN-ISEL-NEXT:   [[S_ADD_C2:%[0-9]+]]:sreg_32, [[S_ADD_C3:%[0-9]+]]:sreg_64_xexec = S_ADD_CO_PSEUDO [[S_ADD_C]], killed [[COPY57]], killed [[S_UADDO3]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY58:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0
+; GCN-ISEL-NEXT:   [[COPY59:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_C2]]
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_12:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY58]], [[COPY59]], implicit $exec
+; GCN-ISEL-NEXT:   [[S_MUL_I32_14:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY58]], [[S_ADD_C2]]
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE17:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_14]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_12]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY60:%[0-9]+]]:vgpr_32 = COPY [[S_UADDO2]]
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_13:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY58]], [[COPY60]], implicit $exec
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE18:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_MUL_HI_U32_e64_13]], %subreg.sub0, [[S_MOV_B32_9]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_ADD_U4:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE18]], killed [[REG_SEQUENCE17]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY61:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U4]].sub0
+; GCN-ISEL-NEXT:   [[COPY62:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U4]].sub1
+; GCN-ISEL-NEXT:   [[COPY63:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub1
+; GCN-ISEL-NEXT:   [[COPY64:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_C2]]
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_14:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY63]], [[COPY64]], implicit $exec
+; GCN-ISEL-NEXT:   [[COPY65:%[0-9]+]]:vgpr_32 = COPY [[S_UADDO2]]
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_15:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY63]], [[COPY65]], implicit $exec
+; GCN-ISEL-NEXT:   [[S_MUL_I32_15:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY63]], [[S_UADDO2]]
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE19:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_15]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_15]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY66:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE19]].sub0
+; GCN-ISEL-NEXT:   [[COPY67:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE19]].sub1
+; GCN-ISEL-NEXT:   [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 killed [[COPY61]], killed [[COPY66]], implicit-def $scc
+; GCN-ISEL-NEXT:   [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY62]], killed [[COPY67]], implicit-def $scc, implicit $scc
+; GCN-ISEL-NEXT:   [[COPY68:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_14]]
+; GCN-ISEL-NEXT:   [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY68]], [[S_MOV_B32_10]], implicit-def dead $scc, implicit $scc
+; GCN-ISEL-NEXT:   [[S_MUL_I32_16:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY63]], [[S_ADD_C2]]
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE20:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_16]], %subreg.sub0, killed [[S_ADDC_U32_5]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE21:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_2]], %subreg.sub0, killed [[S_ADDC_U32_4]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY69:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE21]].sub1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE22:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY69]], %subreg.sub0, [[S_MOV_B32_10]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_ADD_U5:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE22]], killed [[REG_SEQUENCE20]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY70:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U5]].sub1
+; GCN-ISEL-NEXT:   [[S_MUL_I32_17:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY23]], [[COPY70]]
+; GCN-ISEL-NEXT:   [[COPY71:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U5]].sub0
+; GCN-ISEL-NEXT:   [[COPY72:%[0-9]+]]:vgpr_32 = COPY [[COPY71]]
+; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_16:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY23]], [[COPY72]], implicit $exec
+; GCN-ISEL-NEXT:   [[COPY73:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_16]]
+; GCN-ISEL-NEXT:   [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY73]], killed [[S_MUL_I32_17]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[S_MUL_I32_18:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY24]], [[COPY71]]
+; GCN-ISEL-NEXT:   [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_7]], killed [[S_MUL_I32_18]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[S_SUB_I32_3:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY63]], [[S_ADD_I32_8]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[S_MUL_I32_19:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY23]], [[COPY71]]
+; GCN-ISEL-NEXT:   [[S_USUBO:%[0-9]+]]:sreg_32, [[S_USUBO1:%[0-9]+]]:sreg_64_xexec = S_USUBO_PSEUDO [[COPY58]], killed [[S_MUL_I32_19]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[S_SUB_C:%[0-9]+]]:sreg_32, [[S_SUB_C1:%[0-9]+]]:sreg_64_xexec = S_SUB_CO_PSEUDO killed [[S_SUB_I32_3]], [[COPY24]], [[S_USUBO1]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[S_USUBO2:%[0-9]+]]:sreg_32, [[S_USUBO3:%[0-9]+]]:sreg_64_xexec = S_USUBO_PSEUDO [[S_USUBO]], [[COPY23]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[S_SUB_C2:%[0-9]+]]:sreg_32, [[S_SUB_C3:%[0-9]+]]:sreg_64_xexec = S_SUB_CO_PSEUDO killed [[S_SUB_C]], [[S_MOV_B32_10]], killed [[S_USUBO3]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   S_CMP_GE_U32 [[S_SUB_C2]], [[COPY24]], implicit-def $scc
+; GCN-ISEL-NEXT:   [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[S_CSELECT_B32_3:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
+; GCN-ISEL-NEXT:   S_CMP_GE_U32 killed [[S_USUBO2]], [[COPY23]], implicit-def $scc
+; GCN-ISEL-NEXT:   [[S_CSELECT_B32_4:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
+; GCN-ISEL-NEXT:   S_CMP_EQ_U32 [[S_SUB_C2]], [[COPY24]], implicit-def $scc
+; GCN-ISEL-NEXT:   [[S_CSELECT_B32_5:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_4]], killed [[S_CSELECT_B32_3]], implicit $scc
+; GCN-ISEL-NEXT:   [[COPY74:%[0-9]+]]:sreg_32 = COPY killed [[S_CSELECT_B32_5]]
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE23:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY71]], %subreg.sub0, [[COPY70]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+; GCN-ISEL-NEXT:   [[S_ADD_U6:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO [[REG_SEQUENCE23]], killed [[S_MOV_B64_3]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY75:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U6]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B64_4:%[0-9]+]]:sreg_64 = S_MOV_B64 2
+; GCN-ISEL-NEXT:   [[S_ADD_U7:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO [[REG_SEQUENCE23]], killed [[S_MOV_B64_4]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   [[COPY76:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U7]].sub0
+; GCN-ISEL-NEXT:   S_CMP_LG_U32 killed [[COPY74]], [[S_MOV_B32_10]], implicit-def $scc
+; GCN-ISEL-NEXT:   [[S_CSELECT_B32_6:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[COPY76]], killed [[COPY75]], implicit $scc
+; GCN-ISEL-NEXT:   [[COPY77:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U6]].sub1
+; GCN-ISEL-NEXT:   [[COPY78:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U7]].sub1
+; GCN-ISEL-NEXT:   [[S_CSELECT_B32_7:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[COPY78]], killed [[COPY77]], implicit $scc
+; GCN-ISEL-NEXT:   [[S_SUB_C4:%[0-9]+]]:sreg_32, [[S_SUB_C5:%[0-9]+]]:sreg_64_xexec = S_SUB_CO_PSEUDO [[COPY63]], [[S_ADD_I32_8]], [[S_USUBO1]], implicit-def dead $scc
+; GCN-ISEL-NEXT:   S_CMP_GE_U32 [[S_SUB_C4]], [[COPY24]], implicit-def $scc
+; GCN-ISEL-NEXT:   [[S_CSELECT_B32_8:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
+; GCN-ISEL-NEXT:   S_CMP_GE_U32 [[S_USUBO]], [[COPY23]], implicit-def $scc
+; GCN-ISEL-NEXT:   [[S_CSELECT_B32_9:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
+; GCN-ISEL-NEXT:   S_CMP_EQ_U32 [[S_SUB_C4]], [[COPY24]], implicit-def $scc
+; GCN-ISEL-NEXT:   [[S_CSELECT_B32_10:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_9]], killed [[S_CSELECT_B32_8]], implicit $scc
+; GCN-ISEL-NEXT:   [[COPY79:%[0-9]+]]:sreg_32 = COPY killed [[S_CSELECT_B32_10]]
+; GCN-ISEL-NEXT:   S_CMP_LG_U32 killed [[COPY79]], [[S_MOV_B32_10]], implicit-def $scc
+; GCN-ISEL-NEXT:   [[S_CSELECT_B32_11:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_7]], [[COPY70]], implicit $scc
+; GCN-ISEL-NEXT:   [[S_CSELECT_B32_12:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_6]], [[COPY71]], implicit $scc
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE24:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_CSELECT_B32_12]], %subreg.sub0, killed [[S_CSELECT_B32_11]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[S_MOV_B64_5:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+; GCN-ISEL-NEXT:   [[COPY80:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE24]]
+; GCN-ISEL-NEXT:   S_BRANCH %bb.1
+; GCN-ISEL-NEXT: {{  $}}
+; GCN-ISEL-NEXT: bb.4 (%ir-block.14):
+; GCN-ISEL-NEXT:   [[PHI2:%[0-9]+]]:sreg_64 = PHI [[PHI]], %bb.1, [[COPY22]], %bb.2
+; GCN-ISEL-NEXT:   [[COPY81:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1
+; GCN-ISEL-NEXT:   [[COPY82:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE25:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY82]], %subreg.sub0, killed [[COPY81]], %subreg.sub1
+; GCN-ISEL-NEXT:   [[COPY83:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE25]].sub1
+; GCN-ISEL-NEXT:   [[COPY84:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE25]].sub0
+; GCN-ISEL-NEXT:   [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT:   [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT:   [[REG_SEQUENCE26:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY84]], %subreg.sub0, killed [[COPY83]], %subreg.sub1, killed [[S_MOV_B32_13]], %subreg.sub2, killed [[S_MOV_B32_12]], %subreg.sub3
+; GCN-ISEL-NEXT:   [[COPY85:%[0-9]+]]:vreg_64 = COPY [[PHI2]]
+; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET [[COPY85]], killed [[REG_SEQUENCE26]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.16, addrspace 1)
+; GCN-ISEL-NEXT:   S_ENDPGM 0
   %result = udiv i64 %x, %y
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -3225,5 +3932,3 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN-ISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 8a54ad301f48a..7f3c0c0c8605e 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -14,13 +14,15 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_sub_u32 s2, s2, s8
-; SI-NEXT:    s_subb_u32 s3, s3, s9
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
+; SI-NEXT:    s_sub_u32 s0, s2, s8
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; SI-NEXT:    s_subb_u32 s1, s3, s9
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -31,13 +33,15 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_sub_u32 s2, s2, s4
-; VI-NEXT:    s_subb_u32 s3, s3, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT:    s_sub_u32 s0, s2, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
+; VI-NEXT:    s_subb_u32 s1, s3, s5
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
@@ -48,10 +52,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
 ; GFX9-NEXT:    s_sub_u32 s4, s2, s6
 ; GFX9-NEXT:    s_subb_u32 s5, s3, s7
-; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -65,9 +71,9 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_u64_e64 s4, s[2:3], s[6:7]
 ; GFX10-NEXT:    s_sub_u32 s2, s2, s6
 ; GFX10-NEXT:    s_subb_u32 s3, s3, s7
-; GFX10-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10-NEXT:    v_add_co_u32 v0, s2, s2, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
@@ -81,13 +87,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_gt_u64_e64 s6, s[2:3], s[4:5]
 ; GFX11-NEXT:    s_sub_u32 s2, s2, s4
 ; GFX11-NEXT:    s_subb_u32 s3, s3, s5
-; GFX11-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_u32 v0, s2, s2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
@@ -429,20 +434,21 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_sub_u32 s4, s4, s6
-; SI-NEXT:    s_subb_u32 s5, s5, s7
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    s_sub_u32 s6, s4, s6
+; SI-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; SI-NEXT:    s_subb_u32 s7, s5, s7
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    s_mov_b32 s0, s2
 ; SI-NEXT:    s_mov_b32 s1, s3
 ; SI-NEXT:    s_mov_b32 s2, s10
 ; SI-NEXT:    s_mov_b32 s3, s11
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v3, s7
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -451,16 +457,18 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    s_sub_u32 s0, s4, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v5, s7
 ; VI-NEXT:    s_subb_u32 s1, s5, s7
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; VI-NEXT:    v_mov_b32_e32 v6, s0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -469,12 +477,14 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s14
+; GFX9-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX9-NEXT:    s_sub_u32 s0, s12, s14
 ; GFX9-NEXT:    s_subb_u32 s1, s13, s15
+; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX9-NEXT:    global_store_byte v2, v3, s[10:11]
 ; GFX9-NEXT:    s_endpgm
@@ -487,8 +497,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX10-NEXT:    s_sub_u32 s0, s12, s14
 ; GFX10-NEXT:    s_subb_u32 s1, s13, s15
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    v_cmp_gt_u64_e64 s0, s[12:13], s[14:15]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX10-NEXT:    global_store_byte v2, v3, s[10:11]
@@ -498,11 +508,11 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sub_u32 s4, s4, s6
-; GFX11-NEXT:    s_subb_u32 s5, s5, s7
-; GFX11-NEXT:    v_mov_b32_e32 v0, s4
-; GFX11-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    s_sub_u32 s8, s4, s6
+; GFX11-NEXT:    v_cmp_gt_u64_e64 s4, s[4:5], s[6:7]
+; GFX11-NEXT:    s_subb_u32 s9, s5, s7
+; GFX11-NEXT:    v_mov_b32_e32 v0, s8
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -538,11 +548,11 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; SI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
+; SI-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
+; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -561,59 +571,64 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    v_mov_b32_e32 v6, s2
 ; VI-NEXT:    v_mov_b32_e32 v7, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
-; VI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
+; VI-NEXT:    v_sub_u32_e32 v8, vcc, v0, v2
+; VI-NEXT:    v_subb_u32_e32 v9, vcc, v1, v3, vcc
+; VI-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-NEXT:    flat_store_dwordx2 v[4:5], v[8:9]
 ; VI-NEXT:    flat_store_byte v[6:7], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_usubo_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[12:13]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[14:15]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_byte v4, v0, s[10:11]
+; GFX9-NEXT:    global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX9-NEXT:    global_store_byte v6, v0, s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_usubo_i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[12:13]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[14:15]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
-; GFX10-NEXT:    global_store_byte v4, v2, s[10:11]
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX10-NEXT:    global_store_byte v6, v0, s[10:11]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_usubo_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[4:5]
-; GFX11-NEXT:    global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[4:5]
+; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
-; GFX11-NEXT:    global_store_b8 v4, v2, s[2:3]
+; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[0:1]
+; GFX11-NEXT:    global_store_b8 v6, v0, s[2:3]
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -675,11 +690,11 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_sub_u32_e32 v5, vcc, v4, v5
+; VI-NEXT:    v_sub_u32_e32 v6, vcc, v4, v5
+; VI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; VI-NEXT:    v_and_b32_e32 v6, 0xffff, v5
-; VI-NEXT:    v_cmp_gt_u32_e32 vcc, v6, v4
-; VI-NEXT:    flat_store_short v[0:1], v5
+; VI-NEXT:    v_cmp_gt_u32_e32 vcc, v4, v5
+; VI-NEXT:    flat_store_short v[0:1], v6
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
@@ -692,10 +707,10 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    global_load_ushort v1, v0, s[12:13]
 ; GFX9-NEXT:    global_load_ushort v2, v0, s[14:15]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_u32_e32 v2, v1, v2
-; GFX9-NEXT:    v_cmp_gt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NEXT:    v_sub_u32_e32 v3, v1, v2
+; GFX9-NEXT:    v_cmp_gt_u32_sdwa s[0:1], v1, v2 src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT:    global_store_short v0, v2, s[8:9]
+; GFX9-NEXT:    global_store_short v0, v3, s[8:9]
 ; GFX9-NEXT:    global_store_byte v0, v1, s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -708,10 +723,10 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX10-NEXT:    global_load_ushort v1, v0, s[12:13]
 ; GFX10-NEXT:    global_load_ushort v2, v0, s[14:15]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v1, v2
-; GFX10-NEXT:    v_cmp_gt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-NEXT:    v_cmp_gt_u32_sdwa s0, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, v1, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT:    global_store_short v0, v2, s[8:9]
+; GFX10-NEXT:    global_store_short v0, v3, s[8:9]
 ; GFX10-NEXT:    global_store_byte v0, v1, s[10:11]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -721,18 +736,19 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_d16_b16 v1, v0, s[4:5]
-; GFX11-NEXT:    global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT:    global_load_d16_b16 v1, v0, s[6:7]
+; GFX11-NEXT:    global_load_d16_b16 v2, v0, s[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v2, v1, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v3, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v2, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b16 v0, v2, s[0:1]
-; GFX11-NEXT:    global_store_b8 v0, v1, s[2:3]
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index 3ddb2f02c48fe..8bd4073e35c74 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -730,38 +730,52 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_usubsat_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
+; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v5, 0, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v0, v2
+; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
+; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, 0, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, 0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10PLUS-LABEL: v_usubsat_i64:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_usubsat_i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, 0, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_usubsat_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, 0, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
 }
diff --git a/llvm/test/CodeGen/ARM/addsubo-legalization.ll b/llvm/test/CodeGen/ARM/addsubo-legalization.ll
index 5ebb115791c66..e5789de4ca415 100644
--- a/llvm/test/CodeGen/ARM/addsubo-legalization.ll
+++ b/llvm/test/CodeGen/ARM/addsubo-legalization.ll
@@ -49,24 +49,24 @@ define <2 x i1> @usubo(ptr %ptr, ptr %ptr2) {
 ; CHECK-LABEL: usubo:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
 ; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
-; CHECK-NEXT:    vsub.i64 q8, q9, q8
-; CHECK-NEXT:    vmov lr, r12, d18
-; CHECK-NEXT:    vmov r4, r5, d19
-; CHECK-NEXT:    vmov r3, r2, d16
-; CHECK-NEXT:    vmov r6, r7, d17
-; CHECK-NEXT:    subs.w r3, lr, r3
-; CHECK-NEXT:    sbcs.w r2, r12, r2
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmov r3, r2, d18
+; CHECK-NEXT:    vmov r6, r7, d19
+; CHECK-NEXT:    vmov lr, r12, d16
+; CHECK-NEXT:    vmov r4, r5, d17
+; CHECK-NEXT:    vsub.i64 q8, q8, q9
+; CHECK-NEXT:    subs.w r3, r3, lr
+; CHECK-NEXT:    sbcs.w r2, r2, r12
 ; CHECK-NEXT:    mov.w r2, #0
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r2, #1
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r2, #-1
-; CHECK-NEXT:    subs r3, r4, r6
-; CHECK-NEXT:    sbcs.w r3, r5, r7
+; CHECK-NEXT:    subs r3, r6, r4
+; CHECK-NEXT:    sbcs.w r3, r7, r5
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r1, #1
 ; CHECK-NEXT:    cmp r1, #0
diff --git a/llvm/test/CodeGen/LoongArch/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/LoongArch/atomicrmw-cond-sub-clamp.ll
index ab09cc9ed50a0..81628a15915a2 100644
--- a/llvm/test/CodeGen/LoongArch/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/LoongArch/atomicrmw-cond-sub-clamp.ll
@@ -203,9 +203,9 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; LA64-NEXT:    move $a5, $a4
 ; LA64-NEXT:    srl.w $a4, $a4, $a2
 ; LA64-NEXT:    andi $a4, $a4, 255
-; LA64-NEXT:    sub.d $a6, $a4, $a1
-; LA64-NEXT:    sltu $a4, $a4, $a6
-; LA64-NEXT:    masknez $a4, $a6, $a4
+; LA64-NEXT:    sltu $a6, $a1, $a4
+; LA64-NEXT:    sub.d $a4, $a4, $a1
+; LA64-NEXT:    masknez $a4, $a4, $a6
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    and $a6, $a5, $a3
 ; LA64-NEXT:    or $a6, $a6, $a4
@@ -252,9 +252,9 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; LA64-NEXT:    move $a5, $a4
 ; LA64-NEXT:    srl.w $a4, $a4, $a2
 ; LA64-NEXT:    bstrpick.d $a4, $a4, 15, 0
-; LA64-NEXT:    sub.d $a6, $a4, $a1
-; LA64-NEXT:    sltu $a4, $a4, $a6
-; LA64-NEXT:    masknez $a4, $a6, $a4
+; LA64-NEXT:    sltu $a6, $a1, $a4
+; LA64-NEXT:    sub.d $a4, $a4, $a1
+; LA64-NEXT:    masknez $a4, $a4, $a6
 ; LA64-NEXT:    sll.w $a4, $a4, $a2
 ; LA64-NEXT:    and $a6, $a5, $a3
 ; LA64-NEXT:    or $a6, $a6, $a4
@@ -292,9 +292,9 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB6_3 Depth 2
 ; LA64-NEXT:    move $a3, $a2
-; LA64-NEXT:    sub.d $a2, $a2, $a1
-; LA64-NEXT:    sltu $a4, $a3, $a2
-; LA64-NEXT:    masknez $a4, $a2, $a4
+; LA64-NEXT:    sltu $a2, $a1, $a2
+; LA64-NEXT:    sub.d $a4, $a3, $a1
+; LA64-NEXT:    masknez $a4, $a4, $a2
 ; LA64-NEXT:  .LBB6_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB6_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
@@ -328,9 +328,9 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB7_3 Depth 2
 ; LA64-NEXT:    move $a3, $a2
-; LA64-NEXT:    sub.d $a2, $a2, $a1
-; LA64-NEXT:    sltu $a4, $a3, $a2
-; LA64-NEXT:    masknez $a4, $a2, $a4
+; LA64-NEXT:    sltu $a2, $a1, $a2
+; LA64-NEXT:    sub.d $a4, $a3, $a1
+; LA64-NEXT:    masknez $a4, $a4, $a2
 ; LA64-NEXT:  .LBB7_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB7_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll
index ff176c80ab342..8e22b4eb6bb9d 100644
--- a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll
@@ -218,16 +218,15 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; CHECK-NEXT:    # Child Loop BB4_4 Depth 2
 ; CHECK-NEXT:    srw 8, 7, 3
 ; CHECK-NEXT:    clrlwi 9, 8, 24
-; CHECK-NEXT:    sub 8, 9, 4
-; CHECK-NEXT:    cmplw 8, 9
-; CHECK-NEXT:    li 9, 0
+; CHECK-NEXT:    cmplw 9, 4
+; CHECK-NEXT:    li 8, 0
 ; CHECK-NEXT:    bgt 0, .LBB4_3
 ; CHECK-NEXT:  # %bb.2: # %atomicrmw.start
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 9, 8
+; CHECK-NEXT:    sub 8, 9, 4
 ; CHECK-NEXT:  .LBB4_3: # %atomicrmw.start
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 8, 9, 3
+; CHECK-NEXT:    slw 8, 8, 3
 ; CHECK-NEXT:    and 9, 7, 6
 ; CHECK-NEXT:    or 9, 9, 8
 ; CHECK-NEXT:  .LBB4_4: # %cmpxchg.start
@@ -277,16 +276,15 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; CHECK-NEXT:    # Child Loop BB5_4 Depth 2
 ; CHECK-NEXT:    srw 8, 7, 3
 ; CHECK-NEXT:    clrlwi 9, 8, 16
-; CHECK-NEXT:    sub 8, 9, 4
-; CHECK-NEXT:    cmplw 8, 9
-; CHECK-NEXT:    li 9, 0
+; CHECK-NEXT:    cmplw 9, 4
+; CHECK-NEXT:    li 8, 0
 ; CHECK-NEXT:    bgt 0, .LBB5_3
 ; CHECK-NEXT:  # %bb.2: # %atomicrmw.start
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 9, 8
+; CHECK-NEXT:    sub 8, 9, 4
 ; CHECK-NEXT:  .LBB5_3: # %atomicrmw.start
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 8, 9, 3
+; CHECK-NEXT:    slw 8, 8, 3
 ; CHECK-NEXT:    and 9, 7, 6
 ; CHECK-NEXT:    or 9, 9, 8
 ; CHECK-NEXT:  .LBB5_4: # %cmpxchg.start
@@ -325,13 +323,12 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
 ; CHECK-NEXT:  .LBB6_1: # %atomicrmw.start
 ; CHECK-NEXT:    # =>This Loop Header: Depth=1
 ; CHECK-NEXT:    # Child Loop BB6_3 Depth 2
-; CHECK-NEXT:    sub 5, 6, 4
-; CHECK-NEXT:    cmplw 5, 6
+; CHECK-NEXT:    cmplw 6, 4
 ; CHECK-NEXT:    li 7, 0
 ; CHECK-NEXT:    bgt 0, .LBB6_3
 ; CHECK-NEXT:  # %bb.2: # %atomicrmw.start
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 7, 5
+; CHECK-NEXT:    sub 7, 6, 4
 ; CHECK-NEXT:  .LBB6_3: # %cmpxchg.start
 ; CHECK-NEXT:    # Parent Loop BB6_1 Depth=1
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/RISCV/pr170634.ll b/llvm/test/CodeGen/RISCV/pr170634.ll
new file mode 100644
index 0000000000000..52f011e167e0f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr170634.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv64 -O1 < %s | FileCheck %s
+
+declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64)
+
+; Test that usub.with.overflow generates efficient code without an extra mv instruction
+define i64 @test_usubo_no_extra_mv(i64 %x, i64 %y) {
+; CHECK-LABEL: test_usubo_no_extra_mv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bltu a1, a0, .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    sub a0, a0, a1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    li a0, 291
+; CHECK-NEXT:    ret
+  %res = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %x, i64 %y)
+  %val = extractvalue { i64, i1 } %res, 0
+  %ovf = extractvalue { i64, i1 } %res, 1
+  %ret = select i1 %ovf, i64 291, i64 %val
+  ret i64 %ret
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
index bbc0ff9bd1be5..1e537fe64c08d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
@@ -265,28 +265,28 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @usub_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: usub_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    vmov r0, r8, d3
 ; CHECK-NEXT:    vmov r2, r3, d1
-; CHECK-NEXT:    subs r5, r2, r0
-; CHECK-NEXT:    sbc.w lr, r3, r1
-; CHECK-NEXT:    subs r2, r2, r5
-; CHECK-NEXT:    sbcs.w r2, r3, lr
-; CHECK-NEXT:    vmov r3, r12, d2
-; CHECK-NEXT:    vmov r1, r4, d0
-; CHECK-NEXT:    csetm r2, lo
-; CHECK-NEXT:    subs r3, r1, r3
-; CHECK-NEXT:    sbc.w r0, r4, r12
-; CHECK-NEXT:    subs r1, r1, r3
-; CHECK-NEXT:    sbcs.w r1, r4, r0
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r5
+; CHECK-NEXT:    vmov r4, r5, d2
+; CHECK-NEXT:    vmov r6, r7, d0
+; CHECK-NEXT:    subs.w r12, r2, r0
+; CHECK-NEXT:    sbc.w lr, r3, r8
+; CHECK-NEXT:    subs r1, r6, r4
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r12
+; CHECK-NEXT:    sbc.w r1, r7, r5
+; CHECK-NEXT:    subs r0, r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, lr
+; CHECK-NEXT:    sbcs.w r0, r8, r3
+; CHECK-NEXT:    csetm r0, lo
+; CHECK-NEXT:    subs r1, r4, r6
+; CHECK-NEXT:    sbcs.w r1, r5, r7
 ; CHECK-NEXT:    csetm r1, lo
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, lr
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r2
-; CHECK-NEXT:    vbic q0, q1, q0
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT:    vbic q0, q0, q1
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
   %0 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2)
   ret <2 x i64> %0
diff --git a/llvm/test/CodeGen/VE/Scalar/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/VE/Scalar/atomicrmw-cond-sub-clamp.ll
index 860c4004658db..ad9e8a69cc81f 100644
--- a/llvm/test/CodeGen/VE/Scalar/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/VE/Scalar/atomicrmw-cond-sub-clamp.ll
@@ -138,10 +138,10 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; CHECK-NEXT:    and %s4, %s5, (32)0
 ; CHECK-NEXT:    srl %s4, %s4, %s0
 ; CHECK-NEXT:    and %s4, %s4, (56)0
-; CHECK-NEXT:    subs.w.sx %s6, %s4, %s3
-; CHECK-NEXT:    cmpu.w %s4, %s6, %s4
-; CHECK-NEXT:    cmov.w.gt %s6, (0)1, %s4
-; CHECK-NEXT:    sla.w.sx %s4, %s6, %s0
+; CHECK-NEXT:    cmpu.w %s6, %s4, %s3
+; CHECK-NEXT:    subs.w.sx %s4, %s4, %s3
+; CHECK-NEXT:    cmov.w.gt %s4, (0)1, %s6
+; CHECK-NEXT:    sla.w.sx %s4, %s4, %s0
 ; CHECK-NEXT:    and %s6, %s5, %s2
 ; CHECK-NEXT:    or %s4, %s6, %s4
 ; CHECK-NEXT:    cas.w %s4, (%s1), %s5
@@ -174,10 +174,10 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; CHECK-NEXT:    and %s4, %s5, (32)0
 ; CHECK-NEXT:    srl %s4, %s4, %s0
 ; CHECK-NEXT:    and %s4, %s4, (48)0
-; CHECK-NEXT:    subs.w.sx %s6, %s4, %s3
-; CHECK-NEXT:    cmpu.w %s4, %s6, %s4
-; CHECK-NEXT:    cmov.w.gt %s6, (0)1, %s4
-; CHECK-NEXT:    sla.w.sx %s4, %s6, %s0
+; CHECK-NEXT:    cmpu.w %s6, %s4, %s3
+; CHECK-NEXT:    subs.w.sx %s4, %s4, %s3
+; CHECK-NEXT:    cmov.w.gt %s4, (0)1, %s6
+; CHECK-NEXT:    sla.w.sx %s4, %s4, %s0
 ; CHECK-NEXT:    and %s6, %s5, %s2
 ; CHECK-NEXT:    or %s4, %s6, %s4
 ; CHECK-NEXT:    cas.w %s4, (%s1), %s5
@@ -200,8 +200,8 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
 ; CHECK-NEXT:  .LBB6_1: # %atomicrmw.start
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    or %s3, 0, %s2
+; CHECK-NEXT:    cmpu.w %s4, %s2, %s1
 ; CHECK-NEXT:    subs.w.sx %s2, %s2, %s1
-; CHECK-NEXT:    cmpu.w %s4, %s2, %s3
 ; CHECK-NEXT:    cmov.w.gt %s2, (0)1, %s4
 ; CHECK-NEXT:    cas.w %s2, (%s0), %s3
 ; CHECK-NEXT:    brne.w %s2, %s3, .LBB6_1
@@ -222,7 +222,7 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    or %s3, 0, %s2
 ; CHECK-NEXT:    subs.l %s2, %s2, %s1
-; CHECK-NEXT:    cmpu.l %s4, %s2, %s3
+; CHECK-NEXT:    cmpu.l %s4, %s3, %s1
 ; CHECK-NEXT:    cmov.l.gt %s2, (0)1, %s4
 ; CHECK-NEXT:    cas.l %s2, (%s0), %s3
 ; CHECK-NEXT:    brne.l %s2, %s3, .LBB7_1
diff --git a/llvm/test/CodeGen/WebAssembly/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/WebAssembly/atomicrmw-cond-sub-clamp.ll
index 3355237425b42..e3c5da02ab16b 100644
--- a/llvm/test/CodeGen/WebAssembly/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/WebAssembly/atomicrmw-cond-sub-clamp.ll
@@ -189,12 +189,11 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; WASM32-NEXT:    i32.load8_u 0
 ; WASM32-NEXT:    local.tee 2
 ; WASM32-NEXT:    local.get 1
-; WASM32-NEXT:    i32.const 255
-; WASM32-NEXT:    i32.and
 ; WASM32-NEXT:    i32.sub
-; WASM32-NEXT:    local.tee 1
-; WASM32-NEXT:    local.get 1
 ; WASM32-NEXT:    local.get 2
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    i32.const 255
+; WASM32-NEXT:    i32.and
 ; WASM32-NEXT:    i32.gt_u
 ; WASM32-NEXT:    i32.select
 ; WASM32-NEXT:    i32.store8 0
@@ -211,12 +210,11 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; WASM64-NEXT:    i32.load8_u 0
 ; WASM64-NEXT:    local.tee 2
 ; WASM64-NEXT:    local.get 1
-; WASM64-NEXT:    i32.const 255
-; WASM64-NEXT:    i32.and
 ; WASM64-NEXT:    i32.sub
-; WASM64-NEXT:    local.tee 1
-; WASM64-NEXT:    local.get 1
 ; WASM64-NEXT:    local.get 2
+; WASM64-NEXT:    local.get 1
+; WASM64-NEXT:    i32.const 255
+; WASM64-NEXT:    i32.and
 ; WASM64-NEXT:    i32.gt_u
 ; WASM64-NEXT:    i32.select
 ; WASM64-NEXT:    i32.store8 0
@@ -237,12 +235,11 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; WASM32-NEXT:    i32.load16_u 0
 ; WASM32-NEXT:    local.tee 2
 ; WASM32-NEXT:    local.get 1
-; WASM32-NEXT:    i32.const 65535
-; WASM32-NEXT:    i32.and
 ; WASM32-NEXT:    i32.sub
-; WASM32-NEXT:    local.tee 1
-; WASM32-NEXT:    local.get 1
 ; WASM32-NEXT:    local.get 2
+; WASM32-NEXT:    local.get 1
+; WASM32-NEXT:    i32.const 65535
+; WASM32-NEXT:    i32.and
 ; WASM32-NEXT:    i32.gt_u
 ; WASM32-NEXT:    i32.select
 ; WASM32-NEXT:    i32.store16 0
@@ -259,12 +256,11 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; WASM64-NEXT:    i32.load16_u 0
 ; WASM64-NEXT:    local.tee 2
 ; WASM64-NEXT:    local.get 1
-; WASM64-NEXT:    i32.const 65535
-; WASM64-NEXT:    i32.and
 ; WASM64-NEXT:    i32.sub
-; WASM64-NEXT:    local.tee 1
-; WASM64-NEXT:    local.get 1
 ; WASM64-NEXT:    local.get 2
+; WASM64-NEXT:    local.get 1
+; WASM64-NEXT:    i32.const 65535
+; WASM64-NEXT:    i32.and
 ; WASM64-NEXT:    i32.gt_u
 ; WASM64-NEXT:    i32.select
 ; WASM64-NEXT:    i32.store16 0
@@ -286,9 +282,8 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
 ; WASM32-NEXT:    local.tee 2
 ; WASM32-NEXT:    local.get 1
 ; WASM32-NEXT:    i32.sub
-; WASM32-NEXT:    local.tee 1
-; WASM32-NEXT:    local.get 1
 ; WASM32-NEXT:    local.get 2
+; WASM32-NEXT:    local.get 1
 ; WASM32-NEXT:    i32.gt_u
 ; WASM32-NEXT:    i32.select
 ; WASM32-NEXT:    i32.store 0
@@ -306,9 +301,8 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
 ; WASM64-NEXT:    local.tee 2
 ; WASM64-NEXT:    local.get 1
 ; WASM64-NEXT:    i32.sub
-; WASM64-NEXT:    local.tee 1
-; WASM64-NEXT:    local.get 1
 ; WASM64-NEXT:    local.get 2
+; WASM64-NEXT:    local.get 1
 ; WASM64-NEXT:    i32.gt_u
 ; WASM64-NEXT:    i32.select
 ; WASM64-NEXT:    i32.store 0
@@ -330,9 +324,8 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; WASM32-NEXT:    local.tee 2
 ; WASM32-NEXT:    local.get 1
 ; WASM32-NEXT:    i64.sub
-; WASM32-NEXT:    local.tee 1
-; WASM32-NEXT:    local.get 1
 ; WASM32-NEXT:    local.get 2
+; WASM32-NEXT:    local.get 1
 ; WASM32-NEXT:    i64.gt_u
 ; WASM32-NEXT:    i64.select
 ; WASM32-NEXT:    i64.store 0
@@ -350,9 +343,8 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; WASM64-NEXT:    local.tee 2
 ; WASM64-NEXT:    local.get 1
 ; WASM64-NEXT:    i64.sub
-; WASM64-NEXT:    local.tee 1
-; WASM64-NEXT:    local.get 1
 ; WASM64-NEXT:    local.get 2
+; WASM64-NEXT:    local.get 1
 ; WASM64-NEXT:    i64.gt_u
 ; WASM64-NEXT:    i64.select
 ; WASM64-NEXT:    i64.store 0
diff --git a/llvm/test/CodeGen/X86/combine-addo.ll b/llvm/test/CodeGen/X86/combine-addo.ll
index ba748b6e653cf..878dee6b2921b 100644
--- a/llvm/test/CodeGen/X86/combine-addo.ll
+++ b/llvm/test/CodeGen/X86/combine-addo.ll
@@ -75,23 +75,12 @@ define i32 @combine_uadd_not(i32 %a0, i32 %a1) {
 define <4 x i32> @combine_vec_uadd_not(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-LABEL: combine_vec_uadd_not:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    psubd %xmm0, %xmm2
-; SSE-NEXT:    pmovsxbd {{.*#+}} xmm0 = [1,1,1,1]
-; SSE-NEXT:    pmaxud %xmm2, %xmm0
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm1
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_uadd_not:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX-NEXT:    vpmaxud %xmm2, %xmm0, %xmm2
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %1 = xor <4 x i32> %a0, <i32 -1, i32 -1, i32 -1, i32 -1>
   %2 = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
diff --git a/llvm/test/CodeGen/X86/combine-subo.ll b/llvm/test/CodeGen/X86/combine-subo.ll
index 5e4bba6e0fd35..f336b08318373 100644
--- a/llvm/test/CodeGen/X86/combine-subo.ll
+++ b/llvm/test/CodeGen/X86/combine-subo.ll
@@ -202,13 +202,13 @@ define { <4 x i8>, <4 x i1> } @always_usub_const_vector() nounwind {
 ; SSE-LABEL: always_usub_const_vector:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE-NEXT:    xorps %xmm1, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: always_usub_const_vector:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    retq
   %x = call { <4 x i8>, <4 x i1> } @llvm.usub.with.overflow.v4i8(<4 x i8> <i8 0, i8 0, i8 0, i8 0>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>)
   ret { <4 x i8>, <4 x i1> } %x
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index ceb1ad13bc153..4436732f8d86c 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -43,11 +43,11 @@ define <2 x i32> @usubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-LABEL: usubo_v2i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
 ; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    psubd %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm0, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    psubd %xmm1, %xmm0
 ; SSE2-NEXT:    movq %xmm0, (%rdi)
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
@@ -55,43 +55,45 @@ define <2 x i32> @usubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
 ; SSSE3-LABEL: usubo_v2i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
 ; SSSE3-NEXT:    pxor %xmm2, %xmm3
-; SSSE3-NEXT:    psubd %xmm1, %xmm0
 ; SSSE3-NEXT:    pxor %xmm0, %xmm2
 ; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT:    psubd %xmm1, %xmm0
 ; SSSE3-NEXT:    movq %xmm0, (%rdi)
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: usubo_v2i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    psubd %xmm1, %xmm2
-; SSE41-NEXT:    pminud %xmm2, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    movq %xmm2, (%rdi)
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pminud %xmm1, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm3, %xmm2
+; SSE41-NEXT:    psubd %xmm1, %xmm0
+; SSE41-NEXT:    movq %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: usubo_v2i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpminud %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm1, (%rdi)
+; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm2
+; AVX-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovq %xmm0, (%rdi)
+; AVX-NEXT:    vmovdqa %xmm2, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: usubo_v2i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpnleud %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpcmpnleud %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    vmovq %xmm1, (%rdi)
+; AVX512-NEXT:    vmovq %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
   %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
@@ -105,11 +107,11 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-LABEL: usubo_v3i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
 ; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    psubd %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm0, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    psubd %xmm1, %xmm0
 ; SSE2-NEXT:    movq %xmm0, (%rdi)
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    movd %xmm0, 8(%rdi)
@@ -119,11 +121,11 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
 ; SSSE3-LABEL: usubo_v3i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
 ; SSSE3-NEXT:    pxor %xmm2, %xmm3
-; SSSE3-NEXT:    psubd %xmm1, %xmm0
 ; SSSE3-NEXT:    pxor %xmm0, %xmm2
 ; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT:    psubd %xmm1, %xmm0
 ; SSSE3-NEXT:    movq %xmm0, (%rdi)
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSSE3-NEXT:    movd %xmm0, 8(%rdi)
@@ -132,35 +134,37 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
 ;
 ; SSE41-LABEL: usubo_v3i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    psubd %xmm1, %xmm2
-; SSE41-NEXT:    pminud %xmm2, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    pextrd $2, %xmm2, 8(%rdi)
-; SSE41-NEXT:    movq %xmm2, (%rdi)
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pminud %xmm1, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm3, %xmm2
+; SSE41-NEXT:    psubd %xmm1, %xmm0
+; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdi)
+; SSE41-NEXT:    movq %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: usubo_v3i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpminud %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
-; AVX-NEXT:    vmovq %xmm1, (%rdi)
+; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm2
+; AVX-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
+; AVX-NEXT:    vmovq %xmm0, (%rdi)
+; AVX-NEXT:    vmovdqa %xmm2, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: usubo_v3i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpnleud %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpcmpnleud %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
-; AVX512-NEXT:    vmovq %xmm1, (%rdi)
+; AVX512-NEXT:    vpextrd $2, %xmm2, 8(%rdi)
+; AVX512-NEXT:    vmovq %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
   %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
@@ -174,11 +178,11 @@ define <4 x i32> @usubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-LABEL: usubo_v4i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
 ; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    psubd %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm0, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    psubd %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, (%rdi)
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
@@ -186,43 +190,45 @@ define <4 x i32> @usubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
 ; SSSE3-LABEL: usubo_v4i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
 ; SSSE3-NEXT:    pxor %xmm2, %xmm3
-; SSSE3-NEXT:    psubd %xmm1, %xmm0
 ; SSSE3-NEXT:    pxor %xmm0, %xmm2
 ; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT:    psubd %xmm1, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: usubo_v4i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    psubd %xmm1, %xmm2
-; SSE41-NEXT:    pminud %xmm2, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, (%rdi)
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pminud %xmm1, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT:    pxor %xmm3, %xmm2
+; SSE41-NEXT:    psubd %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: usubo_v4i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpminud %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm2
+; AVX-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX-NEXT:    vmovdqa %xmm2, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: usubo_v4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpnleud %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpcmpnleud %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
   %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
@@ -236,13 +242,6 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-LABEL: usubo_v6i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movq %rdi, %rax
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
 ; SSE2-NEXT:    movd %r8d, %xmm0
 ; SSE2-NEXT:    movd %ecx, %xmm1
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -253,37 +252,37 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE2-NEXT:    movd %r9d, %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    psubd %xmm3, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm4, (%rcx)
-; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psubd %xmm2, %xmm0
-; SSE2-NEXT:    movq %xmm0, 16(%rcx)
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    movq %xmm0, 16(%rdi)
-; SSE2-NEXT:    movdqa %xmm4, (%rdi)
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psubd %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    psubd %xmm4, %xmm1
+; SSE2-NEXT:    movq %xmm1, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    movq %xmm2, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm5, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: usubo_v6i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movq %rdi, %rax
-; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
 ; SSSE3-NEXT:    movd %r8d, %xmm0
 ; SSSE3-NEXT:    movd %ecx, %xmm1
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -294,25 +293,32 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSSE3-NEXT:    movd %r9d, %xmm1
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
-; SSSE3-NEXT:    psubd %xmm3, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm4, (%rcx)
-; SSSE3-NEXT:    pxor %xmm3, %xmm4
-; SSSE3-NEXT:    pxor %xmm3, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    psubd %xmm2, %xmm0
-; SSSE3-NEXT:    movq %xmm0, 16(%rcx)
-; SSSE3-NEXT:    pxor %xmm3, %xmm0
-; SSSE3-NEXT:    pxor %xmm3, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT:    movq %xmm0, 16(%rdi)
-; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; SSSE3-NEXT:    psubd %xmm3, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT:    movdqa %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT:    psubd %xmm4, %xmm1
+; SSSE3-NEXT:    movq %xmm1, 16(%rcx)
+; SSSE3-NEXT:    movdqa %xmm0, (%rcx)
+; SSSE3-NEXT:    movq %xmm2, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm5, (%rdi)
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: usubo_v6i32:
@@ -332,61 +338,63 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    psubd %xmm3, %xmm4
-; SSE41-NEXT:    pminud %xmm4, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT:    pxor %xmm3, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm5
-; SSE41-NEXT:    psubd %xmm1, %xmm5
-; SSE41-NEXT:    pminud %xmm5, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE41-NEXT:    pxor %xmm3, %xmm2
-; SSE41-NEXT:    movq %xmm5, 16(%rcx)
-; SSE41-NEXT:    movdqa %xmm4, (%rcx)
-; SSE41-NEXT:    movq %xmm2, 16(%rdi)
-; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    pminud %xmm3, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE41-NEXT:    pxor %xmm5, %xmm4
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
+; SSE41-NEXT:    pminud %xmm1, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    psubd %xmm3, %xmm0
+; SSE41-NEXT:    psubd %xmm1, %xmm2
+; SSE41-NEXT:    movq %xmm2, 16(%rcx)
+; SSE41-NEXT:    movdqa %xmm0, (%rcx)
+; SSE41-NEXT:    movq %xmm6, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm4, (%rdi)
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: usubo_v6i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpminud %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
-; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpminud %xmm3, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm4, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, 16(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: usubo_v6i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpminud %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
-; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, 16(%rdi)
+; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: usubo_v6i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpcmpnleud %ymm0, %ymm1, %k1
+; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpcmpnleud %ymm1, %ymm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
-; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, 16(%rdi)
+; AVX512-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<6 x i32>, <6 x i1>} @llvm.usub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
   %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
@@ -401,17 +409,18 @@ define <8 x i32> @usubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
 ; SSE2-NEXT:    psubd %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, (%rdi)
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    psubd %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pxor %xmm4, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm4
 ; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    psubd %xmm3, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    movdqa %xmm5, %xmm0
 ; SSE2-NEXT:    movdqa %xmm4, %xmm1
 ; SSE2-NEXT:    retq
 ;
@@ -419,72 +428,77 @@ define <8 x i32> @usubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
 ; SSSE3-NEXT:    psubd %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
-; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    psubd %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
 ; SSSE3-NEXT:    pxor %xmm1, %xmm4
 ; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT:    psubd %xmm3, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm5, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm4, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: usubo_v8i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    psubd %xmm2, %xmm4
-; SSE41-NEXT:    pminud %xmm4, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    pminud %xmm2, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
+; SSE41-NEXT:    pxor %xmm6, %xmm4
 ; SSE41-NEXT:    movdqa %xmm1, %xmm5
-; SSE41-NEXT:    psubd %xmm3, %xmm5
-; SSE41-NEXT:    pminud %xmm5, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm5, 16(%rdi)
-; SSE41-NEXT:    movdqa %xmm4, (%rdi)
+; SSE41-NEXT:    pminud %xmm3, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm5
+; SSE41-NEXT:    pxor %xmm6, %xmm5
+; SSE41-NEXT:    psubd %xmm2, %xmm0
+; SSE41-NEXT:    psubd %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: usubo_v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpminud %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vmovdqa %xmm2, 16(%rdi)
-; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpminud %xmm3, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm4, %xmm1
+; AVX1-NEXT:    vmovdqa %xmm1, 16(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: usubo_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpminud %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa %ymm0, (%rdi)
+; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: usubo_v8i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpcmpnleud %ymm0, %ymm1, %k1
+; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpcmpnleud %ymm1, %ymm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX512-NEXT:    vmovdqa %ymm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
   %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
@@ -497,158 +511,167 @@ define <8 x i32> @usubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
 define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-LABEL: usubo_v16i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm9
-; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    movdqa %xmm3, %xmm9
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm8
 ; SSE2-NEXT:    psubd %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, (%rdi)
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm8
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm4
 ; SSE2-NEXT:    psubd %xmm5, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm5
+; SSE2-NEXT:    pxor %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
 ; SSE2-NEXT:    psubd %xmm6, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm6
+; SSE2-NEXT:    pxor %xmm3, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT:    movdqa %xmm7, %xmm6
+; SSE2-NEXT:    pxor %xmm3, %xmm6
+; SSE2-NEXT:    pxor %xmm9, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm3
+; SSE2-NEXT:    psubd %xmm7, %xmm9
+; SSE2-NEXT:    movdqa %xmm9, 48(%rdi)
 ; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
-; SSE2-NEXT:    pxor %xmm8, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    psubd %xmm7, %xmm3
-; SSE2-NEXT:    pxor %xmm3, %xmm8
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm8
-; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
-; SSE2-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm5, %xmm2
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: usubo_v16i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm9
-; SSSE3-NEXT:    pxor %xmm8, %xmm9
+; SSSE3-NEXT:    movdqa %xmm3, %xmm9
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm8
 ; SSSE3-NEXT:    psubd %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
-; SSSE3-NEXT:    pxor %xmm8, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm3, %xmm8
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm8
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm4
-; SSSE3-NEXT:    pxor %xmm8, %xmm4
 ; SSSE3-NEXT:    psubd %xmm5, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
-; SSSE3-NEXT:    pxor %xmm8, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    pxor %xmm3, %xmm5
+; SSSE3-NEXT:    pxor %xmm3, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT:    movdqa %xmm2, %xmm5
 ; SSSE3-NEXT:    psubd %xmm6, %xmm2
+; SSSE3-NEXT:    pxor %xmm3, %xmm6
+; SSSE3-NEXT:    pxor %xmm3, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT:    movdqa %xmm7, %xmm6
+; SSSE3-NEXT:    pxor %xmm3, %xmm6
+; SSSE3-NEXT:    pxor %xmm9, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm3
+; SSSE3-NEXT:    psubd %xmm7, %xmm9
+; SSSE3-NEXT:    movdqa %xmm9, 48(%rdi)
 ; SSSE3-NEXT:    movdqa %xmm2, 32(%rdi)
-; SSSE3-NEXT:    pxor %xmm8, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pxor %xmm8, %xmm4
-; SSSE3-NEXT:    psubd %xmm7, %xmm3
-; SSSE3-NEXT:    pxor %xmm3, %xmm8
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm8
-; SSSE3-NEXT:    movdqa %xmm3, 48(%rdi)
-; SSSE3-NEXT:    movdqa %xmm8, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm8, %xmm0
+; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm5, %xmm2
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: usubo_v16i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm8
-; SSE41-NEXT:    psubd %xmm4, %xmm8
-; SSE41-NEXT:    pminud %xmm8, %xmm0
+; SSE41-NEXT:    pminud %xmm4, %xmm0
 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT:    pxor %xmm4, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm12, %xmm12
+; SSE41-NEXT:    pxor %xmm12, %xmm0
 ; SSE41-NEXT:    movdqa %xmm1, %xmm9
-; SSE41-NEXT:    psubd %xmm5, %xmm9
-; SSE41-NEXT:    pminud %xmm9, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm9, %xmm1
-; SSE41-NEXT:    pxor %xmm4, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm5
-; SSE41-NEXT:    psubd %xmm6, %xmm5
-; SSE41-NEXT:    pminud %xmm5, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE41-NEXT:    pxor %xmm4, %xmm2
-; SSE41-NEXT:    movdqa %xmm3, %xmm6
-; SSE41-NEXT:    psubd %xmm7, %xmm6
-; SSE41-NEXT:    pminud %xmm6, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm6, %xmm3
-; SSE41-NEXT:    pxor %xmm4, %xmm3
-; SSE41-NEXT:    movdqa %xmm6, 48(%rdi)
-; SSE41-NEXT:    movdqa %xmm5, 32(%rdi)
-; SSE41-NEXT:    movdqa %xmm9, 16(%rdi)
+; SSE41-NEXT:    pminud %xmm5, %xmm9
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm9
+; SSE41-NEXT:    pxor %xmm12, %xmm9
+; SSE41-NEXT:    movdqa %xmm2, %xmm10
+; SSE41-NEXT:    pminud %xmm6, %xmm10
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm10
+; SSE41-NEXT:    pxor %xmm12, %xmm10
+; SSE41-NEXT:    movdqa %xmm3, %xmm11
+; SSE41-NEXT:    pminud %xmm7, %xmm11
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm11
+; SSE41-NEXT:    pxor %xmm12, %xmm11
+; SSE41-NEXT:    psubd %xmm4, %xmm8
+; SSE41-NEXT:    psubd %xmm5, %xmm1
+; SSE41-NEXT:    psubd %xmm6, %xmm2
+; SSE41-NEXT:    psubd %xmm7, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, 48(%rdi)
+; SSE41-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSE41-NEXT:    movdqa %xmm1, 16(%rdi)
 ; SSE41-NEXT:    movdqa %xmm8, (%rdi)
+; SSE41-NEXT:    movdqa %xmm9, %xmm1
+; SSE41-NEXT:    movdqa %xmm10, %xmm2
+; SSE41-NEXT:    movdqa %xmm11, %xmm3
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: usubo_v16i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpminud %xmm5, %xmm4, %xmm5
-; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm5
-; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vpminud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpsubd %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpminud %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vpminud %xmm4, %xmm5, %xmm6
 ; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm5, %xmm6
+; AVX1-NEXT:    vpminud %xmm3, %xmm1, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm1, %xmm7
+; AVX1-NEXT:    vpackssdw %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm8
+; AVX1-NEXT:    vpminud %xmm7, %xmm8, %xmm9
+; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm8, %xmm9
+; AVX1-NEXT:    vpminud %xmm2, %xmm0, %xmm10
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm10, %xmm10
+; AVX1-NEXT:    vpackssdw %xmm9, %xmm10, %xmm9
+; AVX1-NEXT:    vpacksswb %xmm6, %xmm9, %xmm9
+; AVX1-NEXT:    vpcmpeqd %xmm10, %xmm10, %xmm10
+; AVX1-NEXT:    vpxor %xmm10, %xmm9, %xmm9
 ; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpminud %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm6, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm7
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX1-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm6
+; AVX1-NEXT:    vpsubd %xmm7, %xmm8, %xmm7
+; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpmovsxbd %xmm9, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
+; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpacksswb %xmm6, %xmm6, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm10, %xmm1
+; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm5
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
 ; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm1
 ; AVX1-NEXT:    vmovdqa %xmm4, 48(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm3, 32(%rdi)
-; AVX1-NEXT:    vmovdqa %xmm5, 16(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm7, 16(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: usubo_v16i32:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminud %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm1, %ymm4
+; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX2-NEXT:    vpxor %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT:    vpackssdw %xmm6, %xmm4, %xmm4
+; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm6
 ; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpminud %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpminud %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpminud %ymm2, %ymm0, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpacksswb %xmm4, %xmm4, %xmm1
 ; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
-; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm5, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
-; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
+; AVX2-NEXT:    vmovdqa %ymm6, (%rdi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: usubo_v16i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
-; AVX512-NEXT:    vpcmpnleud %zmm0, %zmm1, %k1
+; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
 ; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
+; AVX512-NEXT:    vmovdqa64 %zmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
   %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
@@ -663,10 +686,10 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; SSE2-NEXT:    psubb %xmm1, %xmm4
-; SSE2-NEXT:    pminub %xmm4, %xmm0
-; SSE2-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm0
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -690,10 +713,10 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm4
 ; SSSE3-NEXT:    psubb %xmm1, %xmm4
-; SSSE3-NEXT:    pminub %xmm4, %xmm0
-; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSSE3-NEXT:    pminub %xmm0, %xmm1
+; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm1
 ; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT:    pxor %xmm0, %xmm3
+; SSSE3-NEXT:    pxor %xmm1, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -717,10 +740,10 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
 ; SSE41-NEXT:    psubb %xmm1, %xmm4
-; SSE41-NEXT:    pminub %xmm4, %xmm0
-; SSE41-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE41-NEXT:    pminub %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqb %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT:    pxor %xmm0, %xmm3
+; SSE41-NEXT:    pxor %xmm1, %xmm3
 ; SSE41-NEXT:    pmovsxbd %xmm3, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -740,8 +763,8 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; AVX1-LABEL: usubo_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vpminub %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
@@ -759,8 +782,8 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; AVX2-LABEL: usubo_v16i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
-; AVX2-NEXT:    vpminub %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm0
@@ -771,10 +794,10 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: usubo_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpnleub %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpcmpnleub %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
   %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
@@ -788,11 +811,11 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
 ; SSE2-LABEL: usubo_v8i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
 ; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    psubw %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm0, %xmm2
 ; SSE2-NEXT:    pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT:    psubw %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
@@ -806,11 +829,11 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
 ; SSSE3-LABEL: usubo_v8i16:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
 ; SSSE3-NEXT:    pxor %xmm2, %xmm3
-; SSSE3-NEXT:    psubw %xmm1, %xmm0
 ; SSSE3-NEXT:    pxor %xmm0, %xmm2
 ; SSSE3-NEXT:    pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT:    psubw %xmm1, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
@@ -825,9 +848,9 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    psubw %xmm1, %xmm2
-; SSE41-NEXT:    pminuw %xmm2, %xmm0
-; SSE41-NEXT:    pcmpeqw %xmm2, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    pminuw %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqw %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE41-NEXT:    pxor %xmm0, %xmm1
 ; SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
 ; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
@@ -838,35 +861,35 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
 ;
 ; AVX1-LABEL: usubo_v8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpminuw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: usubo_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpminuw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpminuw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: usubo_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpnleuw %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpcmpnleuw %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
   %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
@@ -880,18 +903,18 @@ define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; SSE-LABEL: usubo_v2i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    movdqa %xmm1, %xmm3
 ; SSE-NEXT:    pxor %xmm2, %xmm3
-; SSE-NEXT:    psubq %xmm1, %xmm0
 ; SSE-NEXT:    pxor %xmm0, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm1
-; SSE-NEXT:    pcmpeqd %xmm3, %xmm1
+; SSE-NEXT:    movdqa %xmm2, %xmm4
+; SSE-NEXT:    pcmpeqd %xmm3, %xmm4
 ; SSE-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,3,3,3]
-; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    psubq %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,3,3,3]
+; SSE-NEXT:    pand %xmm1, %xmm3
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,3,3]
-; SSE-NEXT:    por %xmm4, %xmm1
+; SSE-NEXT:    por %xmm3, %xmm1
 ; SSE-NEXT:    movdqa %xmm0, (%rdi)
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
@@ -900,32 +923,32 @@ define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX1-NEXT:    # xmm2 = mem[0,0]
-; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm0
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: usubo_v2i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
+; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
+; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm0
-; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: usubo_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpnleuq %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpsubq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
   %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
diff --git a/llvm/test/Transforms/InstCombine/pr170634.ll b/llvm/test/Transforms/InstCombine/pr170634.ll
deleted file mode 100644
index 3224b8b63afd3..0000000000000
--- a/llvm/test/Transforms/InstCombine/pr170634.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=instcombine -S < %s | FileCheck %s
-define dso_local i64 @func(i64 noundef %x, i64 noundef %y) local_unnamed_addr {
-; CHECK-LABEL: @func(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
-; CHECK-NEXT:    [[TMP0:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
-; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    br label [[RETURN:%.*]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0
-; CHECK-NEXT:    br label [[RETURN]]
-; CHECK:       return:
-; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i64 [ 291, [[IF_THEN]] ], [ [[TMP1]], [[IF_END]] ]
-; CHECK-NEXT:    ret i64 [[RETVAL_0]]
-;
-entry:
-  %0 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %x, i64 %y)
-  %1 = extractvalue { i64, i1 } %0, 1
-  %2 = extractvalue { i64, i1 } %0, 0
-  br i1 %1, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
-  br label %return
-
-if.end:                                           ; preds = %entry
-  br label %return
-
-return:                                           ; preds = %if.end, %if.then
-  %retval.0 = phi i64 [ 291, %if.then ], [ %2, %if.end ]
-  ret i64 %retval.0
-}
-

>From 28c821c099a89a3c47d56dc6e881c2c4e99c245e Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Mon, 8 Dec 2025 17:38:10 +0530
Subject: [PATCH 6/8] Apply suggestions from code review

Co-authored-by: Jay Foad <jay.foad at gmail.com>
---
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 8b46c4c1e66db..ad33f32160f0c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11467,7 +11467,7 @@ void TargetLowering::expandUADDSUBO(
   } else {
     ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT;
     SDValue CompareLHS = IsAdd ? Result : LHS;
-    SDValue CompareRHS = IsAdd ? LHS : RHS;
+    SDValue CompareRHS = RHS;
     SetCC = DAG.getSetCC(dl, SetCCType, CompareLHS, CompareRHS, CC);
   }
   Overflow = DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType);
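
(Editorial note, not part of the patch.) The else-branch above materializes the overflow bit as a single unsigned setcc; which operands feed that setcc, and with which condition code, is what this hunk and the regenerated tests below adjust. As a rough scalar sketch of the underlying conditions (illustration only; the helper names are invented for this note): unsigned addition overflows exactly when the wrapped sum is smaller than either operand, and unsigned subtraction borrows exactly when the left operand is smaller than the right one.

    #include <cassert>
    #include <cstdint>

    // Scalar reference for the unsigned overflow conditions that
    // expandUADDSUBO has to encode (sketch only, not the patch itself).

    // Unsigned addition wraps; it overflowed iff the wrapped result is
    // smaller than either operand.
    static bool uaddOverflows(uint32_t LHS, uint32_t RHS) {
      uint32_t Result = LHS + RHS; // wraps modulo 2^32
      return Result < RHS;         // equivalently: Result < LHS
    }

    // Unsigned subtraction borrows iff the minuend is smaller than the
    // subtrahend; the subtraction result is not needed for this check.
    static bool usubOverflows(uint32_t LHS, uint32_t RHS) {
      return LHS < RHS;
    }

    int main() {
      assert(uaddOverflows(0xFFFFFFFFu, 1u)); // wraps to 0
      assert(!uaddOverflows(1u, 2u));
      assert(usubOverflows(1u, 2u));          // 1 - 2 borrows
      assert(!usubOverflows(2u, 1u));         // 2 - 1 = 1, no borrow
      return 0;
    }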

>From 3de9265a40fbf04c03ad9098b5568157eafdd9fa Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Thu, 11 Dec 2025 16:15:46 +0530
Subject: [PATCH 7/8] Update CHECK-NEXT lines in the affected tests

---
 llvm/test/CodeGen/AArch64/active_lane_mask.ll |   18 +-
 llvm/test/CodeGen/AArch64/vec_uaddo.ll        |   72 +-
 .../test/CodeGen/AMDGPU/carryout-selection.ll |  889 +----
 llvm/test/CodeGen/AMDGPU/carryout-selection.s | 3547 +++++++++++++++++
 llvm/test/CodeGen/AMDGPU/uaddo.ll             |   28 +-
 llvm/test/CodeGen/ARM/addsubo-legalization.ll |   14 +-
 llvm/test/CodeGen/PowerPC/sat-add.ll          |   16 +-
 llvm/test/CodeGen/RISCV/addcarry.ll           |   22 +-
 .../test/CodeGen/RISCV/arith-with-overflow.ll |    6 +-
 .../test/CodeGen/RISCV/overflow-intrinsics.ll |   18 +-
 llvm/test/CodeGen/RISCV/uadd_sat.ll           |   16 +-
 llvm/test/CodeGen/RISCV/uadd_sat_plus.ll      |   16 +-
 .../RISCV/umulo-128-legalisation-lowering.ll  |   66 +-
 llvm/test/CodeGen/RISCV/xaluo.ll              |  243 +-
 llvm/test/CodeGen/RISCV/xqcia.ll              |    8 +-
 .../SPARC/umulo-128-legalisation-lowering.ll  |  164 +-
 .../CodeGen/Thumb2/mve-saturating-arith.ll    |   36 +-
 .../umulo-128-legalisation-lowering.ll        |  147 +-
 .../CodeGen/X86/expand-vp-int-intrinsics.ll   |    7 +-
 llvm/test/CodeGen/X86/sat-add.ll              |   29 +-
 llvm/test/CodeGen/X86/uadd_sat_vec.ll         |  271 +-
 llvm/test/CodeGen/X86/vec_uaddo.ll            |  834 ++--
 22 files changed, 4661 insertions(+), 1806 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/carryout-selection.s

diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index 879dd4c12c0ba..b7a40a9f20519 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -304,12 +304,13 @@ define <16 x i1> @lane_mask_v16i1_i8(i8 %index, i8 %TC) {
 ;
 ; CHECK-STREAMING-LABEL: lane_mask_v16i1_i8:
 ; CHECK-STREAMING:       // %bb.0:
-; CHECK-STREAMING-NEXT:    index z0.b, w0, #1
+; CHECK-STREAMING-NEXT:    index z0.b, #0, #1
 ; CHECK-STREAMING-NEXT:    mov z1.b, w0
 ; CHECK-STREAMING-NEXT:    ptrue p0.b, vl16
-; CHECK-STREAMING-NEXT:    cmphi p1.b, p0/z, z1.b, z0.b
-; CHECK-STREAMING-NEXT:    mov z1.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-STREAMING-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-STREAMING-NEXT:    add z1.b, z1.b, z0.b
+; CHECK-STREAMING-NEXT:    cmphi p1.b, p0/z, z0.b, z1.b
+; CHECK-STREAMING-NEXT:    mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-STREAMING-NEXT:    orr z0.d, z1.d, z0.d
 ; CHECK-STREAMING-NEXT:    mov z1.b, w1
 ; CHECK-STREAMING-NEXT:    cmphi p0.b, p0/z, z1.b, z0.b
 ; CHECK-STREAMING-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
@@ -331,12 +332,13 @@ define <8 x i1> @lane_mask_v8i1_i8(i8 %index, i8 %TC) {
 ;
 ; CHECK-STREAMING-LABEL: lane_mask_v8i1_i8:
 ; CHECK-STREAMING:       // %bb.0:
-; CHECK-STREAMING-NEXT:    index z0.b, w0, #1
+; CHECK-STREAMING-NEXT:    index z0.b, #0, #1
 ; CHECK-STREAMING-NEXT:    mov z1.b, w0
 ; CHECK-STREAMING-NEXT:    ptrue p0.b, vl8
-; CHECK-STREAMING-NEXT:    cmphi p1.b, p0/z, z1.b, z0.b
-; CHECK-STREAMING-NEXT:    mov z1.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-STREAMING-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-STREAMING-NEXT:    add z1.b, z1.b, z0.b
+; CHECK-STREAMING-NEXT:    cmphi p1.b, p0/z, z0.b, z1.b
+; CHECK-STREAMING-NEXT:    mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-STREAMING-NEXT:    orr z0.d, z1.d, z0.d
 ; CHECK-STREAMING-NEXT:    mov z1.b, w1
 ; CHECK-STREAMING-NEXT:    cmphi p0.b, p0/z, z1.b, z0.b
 ; CHECK-STREAMING-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
index 2f51208e49351..e4891496e337c 100644
--- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -19,9 +19,9 @@ declare {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128>, <2 x
 define <1 x i32> @uaddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: uaddo_v1i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v1.2s, v0.2s, v1.2s
-; CHECK-NEXT:    cmhi v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    str s1, [x0]
+; CHECK-NEXT:    add v2.2s, v0.2s, v1.2s
+; CHECK-NEXT:    cmhi v0.2s, v1.2s, v2.2s
+; CHECK-NEXT:    str s2, [x0]
 ; CHECK-NEXT:    ret
   %t = call {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
   %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
@@ -34,9 +34,9 @@ define <1 x i32> @uaddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, ptr %p2) nounwind {
 define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: uaddo_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v1.2s, v0.2s, v1.2s
-; CHECK-NEXT:    cmhi v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    add v2.2s, v0.2s, v1.2s
+; CHECK-NEXT:    cmhi v0.2s, v1.2s, v2.2s
+; CHECK-NEXT:    str d2, [x0]
 ; CHECK-NEXT:    ret
   %t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
   %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
@@ -49,11 +49,11 @@ define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
 define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: uaddo_v3i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    mov s2, v1.s[2]
-; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    str d1, [x0]
-; CHECK-NEXT:    str s2, [x0, #8]
+; CHECK-NEXT:    add v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    cmhi v0.4s, v1.4s, v2.4s
+; CHECK-NEXT:    mov s1, v2.s[2]
+; CHECK-NEXT:    str d2, [x0]
+; CHECK-NEXT:    str s1, [x0, #8]
 ; CHECK-NEXT:    ret
   %t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
   %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
@@ -66,9 +66,9 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
 define <4 x i32> @uaddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: uaddo_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    add v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    cmhi v0.4s, v1.4s, v2.4s
+; CHECK-NEXT:    str q2, [x0]
 ; CHECK-NEXT:    ret
   %t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
   %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
@@ -94,21 +94,21 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-NEXT:    mov v0.s[2], w2
 ; CHECK-NEXT:    ld1 { v1.s }[2], [x8]
 ; CHECK-NEXT:    add x8, sp, #8
-; CHECK-NEXT:    add v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    add v3.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    ld1 { v1.s }[3], [x8]
 ; CHECK-NEXT:    ldr x8, [sp, #32]
 ; CHECK-NEXT:    mov v0.s[3], w3
-; CHECK-NEXT:    cmhi v3.4s, v3.4s, v2.4s
-; CHECK-NEXT:    str d2, [x8, #16]
-; CHECK-NEXT:    mov w5, v3.s[1]
-; CHECK-NEXT:    fmov w4, s3
-; CHECK-NEXT:    add v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    str q1, [x8]
-; CHECK-NEXT:    mov w1, v0.s[1]
-; CHECK-NEXT:    mov w2, v0.s[2]
-; CHECK-NEXT:    mov w3, v0.s[3]
-; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    cmhi v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    str d3, [x8, #16]
+; CHECK-NEXT:    mov w5, v2.s[1]
+; CHECK-NEXT:    fmov w4, s2
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    cmhi v1.4s, v1.4s, v0.4s
+; CHECK-NEXT:    str q0, [x8]
+; CHECK-NEXT:    mov w1, v1.s[1]
+; CHECK-NEXT:    mov w2, v1.s[2]
+; CHECK-NEXT:    mov w3, v1.s[3]
+; CHECK-NEXT:    fmov w0, s1
 ; CHECK-NEXT:    ret
   %t = call {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
   %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
@@ -121,11 +121,11 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: uaddo_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    add v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    cmhi v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    cmhi v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    stp q2, q3, [x0]
+; CHECK-NEXT:    add v4.4s, v0.4s, v2.4s
+; CHECK-NEXT:    add v5.4s, v1.4s, v3.4s
+; CHECK-NEXT:    cmhi v0.4s, v2.4s, v4.4s
+; CHECK-NEXT:    cmhi v1.4s, v3.4s, v5.4s
+; CHECK-NEXT:    stp q4, q5, [x0]
 ; CHECK-NEXT:    ret
   %t = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
   %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
@@ -139,7 +139,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: uaddo_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add v4.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhi v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    cmhi v0.16b, v1.16b, v4.16b
 ; CHECK-NEXT:    str q4, [x0]
 ; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    zip1 v2.8b, v0.8b, v0.8b
@@ -171,7 +171,7 @@ define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: uaddo_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add v2.8h, v0.8h, v1.8h
-; CHECK-NEXT:    cmhi v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    cmhi v0.8h, v1.8h, v2.8h
 ; CHECK-NEXT:    str q2, [x0]
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    zip1 v1.8b, v0.8b, v0.8b
@@ -194,9 +194,9 @@ define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
 define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: uaddo_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v1.2d, v0.2d, v1.2d
-; CHECK-NEXT:    cmhi v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    add v2.2d, v0.2d, v1.2d
+; CHECK-NEXT:    cmhi v0.2d, v1.2d, v2.2d
+; CHECK-NEXT:    str q2, [x0]
 ; CHECK-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEXT:    ret
   %t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 19b801a840ae0..2df0b8df50398 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -128,31 +128,6 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: sadd64rr
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_ADD_U:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[S_ADD_U]]
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[COPY9]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %add = add i64 %a, %b
   store i64 %add, ptr addrspace(1) %out
@@ -263,30 +238,6 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: sadd64ri
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
-; GCN-ISEL-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_ADD_U:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[S_ADD_U]]
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %add = add i64 20015998343286, %a
   store i64 %add, ptr addrspace(1) %out
@@ -389,29 +340,6 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
 ; GFX1250-NEXT:    v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
 ; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: vadd64rr
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $vcc, implicit $exec
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[V_ADD_U]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -514,26 +442,6 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: vadd64ri
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
-; GCN-ISEL-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE2]], implicit-def dead $vcc, implicit $exec
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[V_ADD_U]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -642,24 +550,6 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: suaddo32
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
-; GCN-ISEL-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY3]], killed [[COPY4]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[COPY5]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
@@ -796,35 +686,6 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX1250-NEXT:    global_store_b8 v0, v2, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: uaddo32_vcc_user
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
-; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]]
-; GCN-ISEL-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 killed [[COPY9]], [[COPY11]], 0, implicit $exec
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_ADD_CO_U32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
-; GCN-ISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_ADD_CO_U32_e64_1]], implicit $exec
-; GCN-ISEL-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
@@ -971,38 +832,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[8:9]
 ; GFX1250-NEXT:    global_store_b8 v2, v3, s[10:11]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: suaddo64
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s256) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
-; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
-; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
-; GCN-ISEL-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
-; GCN-ISEL-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
-; GCN-ISEL-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
-; GCN-ISEL-NEXT:   [[S_UADDO:%[0-9]+]]:sreg_32, [[S_UADDO1:%[0-9]+]]:sreg_64_xexec = S_UADDO_PSEUDO killed [[COPY10]], killed [[COPY12]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[S_ADD_C:%[0-9]+]]:sreg_32, [[S_ADD_C1:%[0-9]+]]:sreg_64_xexec = S_ADD_CO_PSEUDO killed [[COPY9]], killed [[COPY11]], killed [[S_UADDO1]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_UADDO]], %subreg.sub0, killed [[S_ADD_C]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE4]]
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[COPY13]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
-; GCN-ISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_ADD_C1]], implicit $exec
-; GCN-ISEL-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
@@ -1149,40 +978,6 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1250-NEXT:    global_store_b8 v2, v3, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: vuaddo64
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
-; GCN-ISEL-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 killed [[COPY3]], [[COPY1]](s32), 0, implicit $exec
-; GCN-ISEL-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-; GCN-ISEL-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]]
-; GCN-ISEL-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_2]]
-; GCN-ISEL-NEXT:   [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY12]], [[COPY13]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
-; GCN-ISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_ADDC_U32_e64_1]], implicit $exec
-; GCN-ISEL-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %tid.ext)
@@ -1310,31 +1105,6 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: ssub64rr
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[S_SUB_U]]
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[COPY9]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %sub = sub i64 %a, %b
   store i64 %sub, ptr addrspace(1) %out
@@ -1445,30 +1215,6 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: ssub64ri
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
-; GCN-ISEL-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[S_SUB_U]]
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %sub = sub i64 20015998343286, %a
   store i64 %sub, ptr addrspace(1) %out
@@ -1571,29 +1317,6 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
 ; GFX1250-NEXT:    v_sub_nc_u64_e32 v[2:3], s[2:3], v[0:1]
 ; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: vsub64rr
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $vcc, implicit $exec
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[V_SUB_U]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -1696,26 +1419,6 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: vsub64ri
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
-; GCN-ISEL-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], implicit-def dead $vcc, implicit $exec
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[V_SUB_U]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -1825,24 +1528,6 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: susubo32
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
-; GCN-ISEL-NEXT:   [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 killed [[COPY3]], killed [[COPY4]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_SUB_I32_]]
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[COPY5]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
@@ -1979,35 +1664,6 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX1250-NEXT:    global_store_b8 v0, v2, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: usubo32_vcc_user
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
-; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]]
-; GCN-ISEL-NEXT:   [[V_SUB_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_SUB_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_SUB_CO_U32_e64 killed [[COPY9]], [[COPY11]], 0, implicit $exec
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORD_OFFSET killed [[V_SUB_CO_U32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
-; GCN-ISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_SUB_CO_U32_e64_1]], implicit $exec
-; GCN-ISEL-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
@@ -2029,21 +1685,20 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; CISI-NEXT:    s_mov_b32 s11, 0xf000
 ; CISI-NEXT:    s_mov_b32 s10, -1
 ; CISI-NEXT:    s_waitcnt lgkmcnt(0)
-; CISI-NEXT:    v_mov_b32_e32 v0, s6
-; CISI-NEXT:    v_mov_b32_e32 v1, s7
-; CISI-NEXT:    s_sub_u32 s6, s4, s6
-; CISI-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
-; CISI-NEXT:    s_subb_u32 s7, s5, s7
-; CISI-NEXT:    v_mov_b32_e32 v2, s6
+; CISI-NEXT:    s_sub_u32 s4, s4, s6
+; CISI-NEXT:    s_subb_u32 s5, s5, s7
 ; CISI-NEXT:    s_mov_b32 s8, s0
 ; CISI-NEXT:    s_mov_b32 s9, s1
+; CISI-NEXT:    v_mov_b32_e32 v0, s4
+; CISI-NEXT:    v_mov_b32_e32 v1, s5
+; CISI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; CISI-NEXT:    s_mov_b32 s0, s2
 ; CISI-NEXT:    s_mov_b32 s1, s3
 ; CISI-NEXT:    s_mov_b32 s2, s10
 ; CISI-NEXT:    s_mov_b32 s3, s11
-; CISI-NEXT:    v_mov_b32_e32 v3, s7
-; CISI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; CISI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; CISI-NEXT:    s_waitcnt expcnt(0)
+; CISI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; CISI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; CISI-NEXT:    s_endpgm
 ;
@@ -2052,18 +1707,16 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    s_sub_u32 s0, s4, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v5, s7
 ; VI-NEXT:    s_subb_u32 s1, s5, s7
-; VI-NEXT:    v_mov_b32_e32 v7, s1
-; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
-; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -2072,14 +1725,12 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s14
-; GFX9-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX9-NEXT:    s_sub_u32 s0, s12, s14
 ; GFX9-NEXT:    s_subb_u32 s1, s13, s15
-; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX9-NEXT:    global_store_byte v2, v3, s[10:11]
 ; GFX9-NEXT:    s_endpgm
@@ -2092,8 +1743,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1010-NEXT:    s_sub_u32 s0, s12, s14
 ; GFX1010-NEXT:    s_subb_u32 s1, s13, s15
 ; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1010-NEXT:    v_cmp_gt_u64_e64 s0, s[12:13], s[14:15]
 ; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
 ; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX1010-NEXT:    global_store_byte v2, v3, s[10:11]
@@ -2104,11 +1755,11 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W32-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT:    s_sub_u32 s8, s4, s6
-; GFX1030W32-NEXT:    v_cmp_gt_u64_e64 s4, s[4:5], s[6:7]
-; GFX1030W32-NEXT:    s_subb_u32 s9, s5, s7
-; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s8
-; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s9
+; GFX1030W32-NEXT:    s_sub_u32 s4, s4, s6
+; GFX1030W32-NEXT:    s_subb_u32 s5, s5, s7
+; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030W32-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX1030W32-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
 ; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1030W32-NEXT:    global_store_byte v2, v3, s[2:3]
@@ -2119,11 +1770,11 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W64-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT:    s_sub_u32 s8, s4, s6
-; GFX1030W64-NEXT:    s_subb_u32 s9, s5, s7
-; GFX1030W64-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[4:5], s[6:7]
-; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s8
-; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s9
+; GFX1030W64-NEXT:    s_sub_u32 s4, s4, s6
+; GFX1030W64-NEXT:    s_subb_u32 s5, s5, s7
+; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030W64-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX1030W64-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1030W64-NEXT:    global_store_byte v2, v3, s[2:3]
@@ -2133,11 +1784,11 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sub_u32 s8, s4, s6
-; GFX11-NEXT:    v_cmp_gt_u64_e64 s4, s[4:5], s[6:7]
-; GFX11-NEXT:    s_subb_u32 s9, s5, s7
-; GFX11-NEXT:    v_mov_b32_e32 v0, s8
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT:    s_sub_u32 s4, s4, s6
+; GFX11-NEXT:    s_subb_u32 s5, s5, s7
+; GFX11-NEXT:    v_mov_b32_e32 v0, s4
+; GFX11-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -2148,51 +1799,17 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
-; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_sub_nc_u64 s[0:1], s[12:13], s[14:15]
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX1250-NEXT:    v_cmp_gt_u64_e64 s0, s[12:13], s[14:15]
+; GFX1250-NEXT:    s_sub_co_u32 s0, s12, s14
+; GFX1250-NEXT:    s_sub_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[8:9]
 ; GFX1250-NEXT:    global_store_b8 v2, v3, s[10:11]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: susubo64
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s256) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
-; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
-; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
-; GCN-ISEL-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
-; GCN-ISEL-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE5]]
-; GCN-ISEL-NEXT:   [[V_CMP_GT_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U64_e64 [[REG_SEQUENCE4]], [[COPY13]], implicit $exec
-; GCN-ISEL-NEXT:   [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO [[REG_SEQUENCE4]], [[REG_SEQUENCE5]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY14:%[0-9]+]]:vreg_64 = COPY [[S_SUB_U]]
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[COPY14]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
-; GCN-ISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_CMP_GT_U64_e64_]], implicit $exec
-; GCN-ISEL-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
@@ -2212,22 +1829,21 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; CISI:       ; %bb.0:
 ; CISI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CISI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
-; CISI-NEXT:    v_mov_b32_e32 v1, 0
 ; CISI-NEXT:    s_mov_b32 s7, 0xf000
 ; CISI-NEXT:    s_mov_b32 s6, -1
 ; CISI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CISI-NEXT:    s_mov_b32 s4, s0
-; CISI-NEXT:    v_mov_b32_e32 v3, s9
-; CISI-NEXT:    v_sub_i32_e32 v2, vcc, s8, v0
-; CISI-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CISI-NEXT:    v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
+; CISI-NEXT:    v_mov_b32_e32 v1, s9
+; CISI-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
 ; CISI-NEXT:    s_mov_b32 s5, s1
+; CISI-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CISI-NEXT:    s_mov_b32 s0, s2
 ; CISI-NEXT:    s_mov_b32 s1, s3
 ; CISI-NEXT:    s_mov_b32 s2, s6
 ; CISI-NEXT:    s_mov_b32 s3, s7
+; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; CISI-NEXT:    s_waitcnt expcnt(0)
 ; CISI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; CISI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
 ; CISI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; CISI-NEXT:    s_endpgm
 ;
@@ -2235,34 +1851,31 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_mov_b32_e32 v7, s5
-; VI-NEXT:    v_sub_u32_e32 v6, vcc, s4, v0
-; VI-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
-; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_mov_b32_e32 v5, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    v_mov_b32_e32 v6, s5
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, s4, v0
+; VI-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s3
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[6:7]
-; VI-NEXT:    flat_store_byte v[4:5], v0
+; VI-NEXT:    flat_store_dwordx2 v[1:2], v[5:6]
+; VI-NEXT:    flat_store_byte v[3:4], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: vusubo64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s6, v0
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
+; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: vusubo64:
@@ -2270,14 +1883,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1010-NEXT:    s_clause 0x1
 ; GFX1010-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1010-NEXT:    v_sub_co_u32 v2, s4, s6, v0
-; GFX1010-NEXT:    v_sub_co_ci_u32_e64 v3, s4, s7, 0, s4
-; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX1010-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX1010-NEXT:    global_store_byte v1, v0, s[2:3]
+; GFX1010-NEXT:    v_sub_co_u32 v0, s4, s6, v0
+; GFX1010-NEXT:    v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT:    global_store_byte v2, v3, s[2:3]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: vusubo64:
@@ -2285,14 +1897,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W32-NEXT:    s_clause 0x1
 ; GFX1030W32-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX1030W32-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1030W32-NEXT:    v_sub_co_u32 v2, s4, s6, v0
-; GFX1030W32-NEXT:    v_sub_co_ci_u32_e64 v3, null, s7, 0, s4
-; GFX1030W32-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX1030W32-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX1030W32-NEXT:    global_store_byte v1, v0, s[2:3]
+; GFX1030W32-NEXT:    v_sub_co_u32 v0, s4, s6, v0
+; GFX1030W32-NEXT:    v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX1030W32-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT:    global_store_byte v2, v3, s[2:3]
 ; GFX1030W32-NEXT:    s_endpgm
 ;
 ; GFX1030W64-LABEL: vusubo64:
@@ -2300,14 +1911,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W64-NEXT:    s_clause 0x1
 ; GFX1030W64-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1030W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX1030W64-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX1030W64-NEXT:    v_sub_co_u32 v2, s[4:5], s6, v0
-; GFX1030W64-NEXT:    v_sub_co_ci_u32_e64 v3, null, s7, 0, s[4:5]
-; GFX1030W64-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX1030W64-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX1030W64-NEXT:    global_store_byte v1, v0, s[2:3]
+; GFX1030W64-NEXT:    v_sub_co_u32 v0, s[4:5], s6, v0
+; GFX1030W64-NEXT:    v_sub_co_ci_u32_e64 v1, s[4:5], s7, 0, s[4:5]
+; GFX1030W64-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT:    global_store_byte v2, v3, s[2:3]
 ; GFX1030W64-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: vusubo64:
@@ -2315,16 +1925,17 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX11-NEXT:    v_sub_co_u32 v2, s4, s6, v0
-; GFX11-NEXT:    v_sub_co_ci_u32_e64 v3, null, s7, 0, s4
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_co_u32 v0, s4, s6, v0
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
-; GFX11-NEXT:    global_store_b8 v1, v0, s[2:3]
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: vusubo64:
@@ -2333,50 +1944,18 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_sub_co_u32 v0, s4, s6, v0
+; GFX1250-NEXT:    v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1250-NEXT:    v_sub_nc_u64_e32 v[2:3], s[6:7], v[0:1]
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
 ; GFX1250-NEXT:    s_clause 0x1
-; GFX1250-NEXT:    global_store_b64 v1, v[2:3], s[0:1]
-; GFX1250-NEXT:    global_store_b8 v1, v0, s[2:3]
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    global_store_b8 v2, v3, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
-; GCN-ISEL-LABEL: name: vusubo64
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
-; GCN-ISEL-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[V_CMP_GT_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE5]], implicit $exec
-; GCN-ISEL-NEXT:   [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO [[REG_SEQUENCE4]], [[REG_SEQUENCE5]], implicit-def dead $vcc, implicit $exec
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET killed [[V_SUB_U]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
-; GCN-ISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_CMP_GT_U64_e64_]], implicit $exec
-; GCN-ISEL-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %tid.ext)
@@ -3626,292 +3205,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1250-NEXT:  .LBB16_4:
 ; GFX1250-NEXT:    ; implicit-def: $sgpr8_sgpr9
 ; GFX1250-NEXT:    s_branch .LBB16_2
-; GCN-ISEL-LABEL: name: sudiv64
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT:   successors: %bb.3(0x50000000), %bb.1(0x30000000)
-; GCN-ISEL-NEXT:   liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
-; GCN-ISEL-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_192 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3, [[COPY2]], %subreg.sub4, [[COPY1]], %subreg.sub5
-; GCN-ISEL-NEXT:   [[COPY7:%[0-9]+]]:sgpr_192 = COPY [[REG_SEQUENCE]]
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]]
-; GCN-ISEL-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
-; GCN-ISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_]], %subreg.sub0, killed [[COPY10]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-; GCN-ISEL-NEXT:   [[COPY11:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
-; GCN-ISEL-NEXT:   [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 killed [[REG_SEQUENCE3]], [[COPY11]], implicit $exec
-; GCN-ISEL-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-; GCN-ISEL-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-; GCN-ISEL-NEXT:   [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, killed [[V_CMP_NE_U64_e64_]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   $vcc = COPY [[S_AND_B64_]]
-; GCN-ISEL-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit $vcc
-; GCN-ISEL-NEXT:   S_BRANCH %bb.1
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT: bb.1.Flow:
-; GCN-ISEL-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF]], %bb.0, %6, %bb.3
-; GCN-ISEL-NEXT:   [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_1]], %bb.0, %40, %bb.3
-; GCN-ISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI1]], implicit $exec
-; GCN-ISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-; GCN-ISEL-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_]]
-; GCN-ISEL-NEXT:   S_CMP_LG_U32 killed [[COPY12]], killed [[S_MOV_B32_1]], implicit-def $scc
-; GCN-ISEL-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit $scc
-; GCN-ISEL-NEXT:   S_BRANCH %bb.2
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT: bb.2 (%ir-block.7):
-; GCN-ISEL-NEXT:   successors: %bb.4(0x80000000)
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY9]].sub0
-; GCN-ISEL-NEXT:   [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-; GCN-ISEL-NEXT:   [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 killed [[S_MOV_B32_2]], [[COPY13]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY13]], implicit $mode, implicit $exec
-; GCN-ISEL-NEXT:   [[V_RCP_IFLAG_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e32 killed [[V_CVT_F32_U32_e32_]], implicit $mode, implicit $exec
-; GCN-ISEL-NEXT:   [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1333788670, killed [[V_RCP_IFLAG_F32_e32_]], implicit $mode, implicit $exec
-; GCN-ISEL-NEXT:   [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 killed [[V_MUL_F32_e32_]], implicit $mode, implicit $exec
-; GCN-ISEL-NEXT:   [[COPY15:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e32_]]
-; GCN-ISEL-NEXT:   [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_SUB_I32_]], [[COPY15]]
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e32_]], killed [[S_MUL_I32_]], implicit $exec
-; GCN-ISEL-NEXT:   [[COPY16:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e32_]]
-; GCN-ISEL-NEXT:   [[COPY17:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_]]
-; GCN-ISEL-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY16]], killed [[COPY17]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY killed [[S_ADD_I32_]]
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY14]], [[COPY18]], implicit $exec
-; GCN-ISEL-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-; GCN-ISEL-NEXT:   [[COPY19:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_1]]
-; GCN-ISEL-NEXT:   [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY19]], [[S_MOV_B32_3]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY20:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_1]]
-; GCN-ISEL-NEXT:   [[S_MUL_I32_1:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY20]], [[COPY13]]
-; GCN-ISEL-NEXT:   [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY14]], killed [[S_MUL_I32_1]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_SUB_I32_1]], [[COPY13]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   S_CMP_GE_U32 [[S_SUB_I32_1]], [[COPY13]], implicit-def $scc
-; GCN-ISEL-NEXT:   [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_SUB_I32_2]], [[S_SUB_I32_1]], implicit $scc
-; GCN-ISEL-NEXT:   [[COPY21:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_1]]
-; GCN-ISEL-NEXT:   [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_ADD_I32_1]], [[COPY21]], implicit $scc
-; GCN-ISEL-NEXT:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_CSELECT_B32_1]], [[S_MOV_B32_3]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   S_CMP_GE_U32 killed [[S_CSELECT_B32_]], [[COPY13]], implicit-def $scc
-; GCN-ISEL-NEXT:   [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_ADD_I32_2]], [[S_CSELECT_B32_1]], implicit $scc
-; GCN-ISEL-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_CSELECT_B32_2]], %subreg.sub0, killed [[S_MOV_B32_4]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY22:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE4]]
-; GCN-ISEL-NEXT:   S_BRANCH %bb.4
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT: bb.3 (%ir-block.12):
-; GCN-ISEL-NEXT:   successors: %bb.1(0x80000000)
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT:   [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY9]].sub0
-; GCN-ISEL-NEXT:   [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY23]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT:   [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY9]].sub1
-; GCN-ISEL-NEXT:   [[V_CVT_F32_U32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY24]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT:   [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 1333788672
-; GCN-ISEL-NEXT:   [[V_FMA_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed [[V_CVT_F32_U32_e64_1]], 0, killed [[S_MOV_B32_5]], 0, killed [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT:   [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, killed [[V_FMA_F32_e64_]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT:   [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 1602224124
-; GCN-ISEL-NEXT:   [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_RCP_F32_e64_]], 0, killed [[S_MOV_B32_6]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT:   [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 796917760
-; GCN-ISEL-NEXT:   [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_MUL_F32_e64_]], 0, killed [[S_MOV_B32_7]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT:   [[V_TRUNC_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_TRUNC_F32_e64 0, killed [[V_MUL_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT:   [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 -813694976
-; GCN-ISEL-NEXT:   [[V_FMA_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[V_TRUNC_F32_e64_]], 0, killed [[S_MOV_B32_8]], 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT:   [[V_CVT_U32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed [[V_FMA_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT:   [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-; GCN-ISEL-NEXT:   [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO killed [[S_MOV_B64_2]], [[COPY9]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY25:%[0-9]+]]:sreg_32 = COPY [[S_SUB_U]].sub1
-; GCN-ISEL-NEXT:   [[COPY26:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
-; GCN-ISEL-NEXT:   [[S_MUL_I32_2:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY25]], [[COPY26]]
-; GCN-ISEL-NEXT:   [[COPY27:%[0-9]+]]:sreg_32 = COPY [[S_SUB_U]].sub0
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY27]], [[V_CVT_U32_F32_e64_]], implicit $exec
-; GCN-ISEL-NEXT:   [[V_CVT_U32_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[V_TRUNC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT:   [[COPY28:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
-; GCN-ISEL-NEXT:   [[S_MUL_I32_3:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[COPY28]]
-; GCN-ISEL-NEXT:   [[COPY29:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_2]]
-; GCN-ISEL-NEXT:   [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY29]], killed [[S_MUL_I32_3]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_3]], killed [[S_MUL_I32_2]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_]], [[S_ADD_I32_4]], implicit $exec
-; GCN-ISEL-NEXT:   [[COPY30:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
-; GCN-ISEL-NEXT:   [[S_MUL_I32_4:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY30]], [[S_ADD_I32_4]]
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_4]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_3]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY31:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
-; GCN-ISEL-NEXT:   [[S_MUL_I32_5:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[COPY31]]
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_4:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_]], [[S_MUL_I32_5]], implicit $exec
-; GCN-ISEL-NEXT:   [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE6:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_MUL_HI_U32_e64_4]], %subreg.sub0, [[S_MOV_B32_9]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_ADD_U:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE6]], killed [[REG_SEQUENCE5]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY32:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U]].sub0
-; GCN-ISEL-NEXT:   [[COPY33:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U]].sub1
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_5:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_1]], [[S_ADD_I32_4]], implicit $exec
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_6:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_1]], [[S_MUL_I32_5]], implicit $exec
-; GCN-ISEL-NEXT:   [[COPY34:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
-; GCN-ISEL-NEXT:   [[S_MUL_I32_6:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY34]], [[S_MUL_I32_5]]
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE7:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_6]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_6]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY35:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE7]].sub0
-; GCN-ISEL-NEXT:   [[COPY36:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE7]].sub1
-; GCN-ISEL-NEXT:   [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-; GCN-ISEL-NEXT:   [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 killed [[COPY32]], killed [[COPY35]], implicit-def $scc
-; GCN-ISEL-NEXT:   [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY33]], killed [[COPY36]], implicit-def $scc, implicit $scc
-; GCN-ISEL-NEXT:   [[COPY37:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_5]]
-; GCN-ISEL-NEXT:   [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY37]], [[S_MOV_B32_10]], implicit-def dead $scc, implicit $scc
-; GCN-ISEL-NEXT:   [[COPY38:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
-; GCN-ISEL-NEXT:   [[S_MUL_I32_7:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY38]], [[S_ADD_I32_4]]
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE8:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_7]], %subreg.sub0, killed [[S_ADDC_U32_1]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE9:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_]], %subreg.sub0, killed [[S_ADDC_U32_]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY39:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE9]].sub1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE10:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY39]], %subreg.sub0, [[S_MOV_B32_10]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_ADD_U1:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE10]], killed [[REG_SEQUENCE8]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY40:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U1]].sub0
-; GCN-ISEL-NEXT:   [[COPY41:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
-; GCN-ISEL-NEXT:   [[S_UADDO:%[0-9]+]]:sreg_32, [[S_UADDO1:%[0-9]+]]:sreg_64_xexec = S_UADDO_PSEUDO [[COPY41]], killed [[COPY40]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY42:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U1]].sub1
-; GCN-ISEL-NEXT:   [[COPY43:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
-; GCN-ISEL-NEXT:   [[S_ADD_C:%[0-9]+]]:sreg_32, [[S_ADD_C1:%[0-9]+]]:sreg_64_xexec = S_ADD_CO_PSEUDO [[COPY43]], killed [[COPY42]], killed [[S_UADDO1]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[S_MUL_I32_8:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[S_ADD_C]]
-; GCN-ISEL-NEXT:   [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_UADDO]]
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_7:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY27]], [[COPY44]], implicit $exec
-; GCN-ISEL-NEXT:   [[COPY45:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_7]]
-; GCN-ISEL-NEXT:   [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY45]], killed [[S_MUL_I32_8]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[S_MUL_I32_9:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY25]], [[S_UADDO]]
-; GCN-ISEL-NEXT:   [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_5]], killed [[S_MUL_I32_9]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_6]]
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_8:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_ADD_C]], [[COPY46]], implicit $exec
-; GCN-ISEL-NEXT:   [[S_MUL_I32_10:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[S_UADDO]]
-; GCN-ISEL-NEXT:   [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_10]]
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_9:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_ADD_C]], [[COPY47]], implicit $exec
-; GCN-ISEL-NEXT:   [[S_MUL_I32_11:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_ADD_C]], [[S_MUL_I32_10]]
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE11:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_11]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_9]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY48:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE11]].sub0
-; GCN-ISEL-NEXT:   [[COPY49:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE11]].sub1
-; GCN-ISEL-NEXT:   [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_6]]
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_10:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_UADDO]], [[COPY50]], implicit $exec
-; GCN-ISEL-NEXT:   [[S_MUL_I32_12:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_UADDO]], [[S_ADD_I32_6]]
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE12:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_12]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_10]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_10]]
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_11:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_UADDO]], [[COPY51]], implicit $exec
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE13:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_MUL_HI_U32_e64_11]], %subreg.sub0, [[S_MOV_B32_9]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_ADD_U2:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE13]], killed [[REG_SEQUENCE12]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY52:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U2]].sub0
-; GCN-ISEL-NEXT:   [[COPY53:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U2]].sub1
-; GCN-ISEL-NEXT:   [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 killed [[COPY52]], killed [[COPY48]], implicit-def $scc
-; GCN-ISEL-NEXT:   [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY53]], killed [[COPY49]], implicit-def $scc, implicit $scc
-; GCN-ISEL-NEXT:   [[COPY54:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_8]]
-; GCN-ISEL-NEXT:   [[S_ADDC_U32_3:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY54]], [[S_MOV_B32_10]], implicit-def dead $scc, implicit $scc
-; GCN-ISEL-NEXT:   [[S_MUL_I32_13:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_ADD_C]], [[S_ADD_I32_6]]
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE14:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_13]], %subreg.sub0, killed [[S_ADDC_U32_3]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE15:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_1]], %subreg.sub0, killed [[S_ADDC_U32_2]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY55:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE15]].sub1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE16:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY55]], %subreg.sub0, [[S_MOV_B32_10]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_ADD_U3:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE16]], killed [[REG_SEQUENCE14]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY56:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U3]].sub0
-; GCN-ISEL-NEXT:   [[S_UADDO2:%[0-9]+]]:sreg_32, [[S_UADDO3:%[0-9]+]]:sreg_64_xexec = S_UADDO_PSEUDO [[S_UADDO]], killed [[COPY56]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY57:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U3]].sub1
-; GCN-ISEL-NEXT:   [[S_ADD_C2:%[0-9]+]]:sreg_32, [[S_ADD_C3:%[0-9]+]]:sreg_64_xexec = S_ADD_CO_PSEUDO [[S_ADD_C]], killed [[COPY57]], killed [[S_UADDO3]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY58:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0
-; GCN-ISEL-NEXT:   [[COPY59:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_C2]]
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_12:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY58]], [[COPY59]], implicit $exec
-; GCN-ISEL-NEXT:   [[S_MUL_I32_14:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY58]], [[S_ADD_C2]]
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE17:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_14]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_12]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY60:%[0-9]+]]:vgpr_32 = COPY [[S_UADDO2]]
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_13:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY58]], [[COPY60]], implicit $exec
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE18:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_MUL_HI_U32_e64_13]], %subreg.sub0, [[S_MOV_B32_9]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_ADD_U4:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE18]], killed [[REG_SEQUENCE17]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY61:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U4]].sub0
-; GCN-ISEL-NEXT:   [[COPY62:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U4]].sub1
-; GCN-ISEL-NEXT:   [[COPY63:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub1
-; GCN-ISEL-NEXT:   [[COPY64:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_C2]]
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_14:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY63]], [[COPY64]], implicit $exec
-; GCN-ISEL-NEXT:   [[COPY65:%[0-9]+]]:vgpr_32 = COPY [[S_UADDO2]]
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_15:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY63]], [[COPY65]], implicit $exec
-; GCN-ISEL-NEXT:   [[S_MUL_I32_15:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY63]], [[S_UADDO2]]
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE19:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_15]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_15]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY66:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE19]].sub0
-; GCN-ISEL-NEXT:   [[COPY67:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE19]].sub1
-; GCN-ISEL-NEXT:   [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 killed [[COPY61]], killed [[COPY66]], implicit-def $scc
-; GCN-ISEL-NEXT:   [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY62]], killed [[COPY67]], implicit-def $scc, implicit $scc
-; GCN-ISEL-NEXT:   [[COPY68:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_14]]
-; GCN-ISEL-NEXT:   [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY68]], [[S_MOV_B32_10]], implicit-def dead $scc, implicit $scc
-; GCN-ISEL-NEXT:   [[S_MUL_I32_16:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY63]], [[S_ADD_C2]]
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE20:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_16]], %subreg.sub0, killed [[S_ADDC_U32_5]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE21:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_2]], %subreg.sub0, killed [[S_ADDC_U32_4]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY69:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE21]].sub1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE22:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY69]], %subreg.sub0, [[S_MOV_B32_10]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_ADD_U5:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE22]], killed [[REG_SEQUENCE20]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY70:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U5]].sub1
-; GCN-ISEL-NEXT:   [[S_MUL_I32_17:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY23]], [[COPY70]]
-; GCN-ISEL-NEXT:   [[COPY71:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U5]].sub0
-; GCN-ISEL-NEXT:   [[COPY72:%[0-9]+]]:vgpr_32 = COPY [[COPY71]]
-; GCN-ISEL-NEXT:   [[V_MUL_HI_U32_e64_16:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY23]], [[COPY72]], implicit $exec
-; GCN-ISEL-NEXT:   [[COPY73:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_16]]
-; GCN-ISEL-NEXT:   [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY73]], killed [[S_MUL_I32_17]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[S_MUL_I32_18:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY24]], [[COPY71]]
-; GCN-ISEL-NEXT:   [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_7]], killed [[S_MUL_I32_18]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[S_SUB_I32_3:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY63]], [[S_ADD_I32_8]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[S_MUL_I32_19:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY23]], [[COPY71]]
-; GCN-ISEL-NEXT:   [[S_USUBO:%[0-9]+]]:sreg_32, [[S_USUBO1:%[0-9]+]]:sreg_64_xexec = S_USUBO_PSEUDO [[COPY58]], killed [[S_MUL_I32_19]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[S_SUB_C:%[0-9]+]]:sreg_32, [[S_SUB_C1:%[0-9]+]]:sreg_64_xexec = S_SUB_CO_PSEUDO killed [[S_SUB_I32_3]], [[COPY24]], [[S_USUBO1]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[S_USUBO2:%[0-9]+]]:sreg_32, [[S_USUBO3:%[0-9]+]]:sreg_64_xexec = S_USUBO_PSEUDO [[S_USUBO]], [[COPY23]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[S_SUB_C2:%[0-9]+]]:sreg_32, [[S_SUB_C3:%[0-9]+]]:sreg_64_xexec = S_SUB_CO_PSEUDO killed [[S_SUB_C]], [[S_MOV_B32_10]], killed [[S_USUBO3]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   S_CMP_GE_U32 [[S_SUB_C2]], [[COPY24]], implicit-def $scc
-; GCN-ISEL-NEXT:   [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[S_CSELECT_B32_3:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
-; GCN-ISEL-NEXT:   S_CMP_GE_U32 killed [[S_USUBO2]], [[COPY23]], implicit-def $scc
-; GCN-ISEL-NEXT:   [[S_CSELECT_B32_4:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
-; GCN-ISEL-NEXT:   S_CMP_EQ_U32 [[S_SUB_C2]], [[COPY24]], implicit-def $scc
-; GCN-ISEL-NEXT:   [[S_CSELECT_B32_5:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_4]], killed [[S_CSELECT_B32_3]], implicit $scc
-; GCN-ISEL-NEXT:   [[COPY74:%[0-9]+]]:sreg_32 = COPY killed [[S_CSELECT_B32_5]]
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE23:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY71]], %subreg.sub0, [[COPY70]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 1
-; GCN-ISEL-NEXT:   [[S_ADD_U6:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO [[REG_SEQUENCE23]], killed [[S_MOV_B64_3]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY75:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U6]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B64_4:%[0-9]+]]:sreg_64 = S_MOV_B64 2
-; GCN-ISEL-NEXT:   [[S_ADD_U7:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO [[REG_SEQUENCE23]], killed [[S_MOV_B64_4]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   [[COPY76:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U7]].sub0
-; GCN-ISEL-NEXT:   S_CMP_LG_U32 killed [[COPY74]], [[S_MOV_B32_10]], implicit-def $scc
-; GCN-ISEL-NEXT:   [[S_CSELECT_B32_6:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[COPY76]], killed [[COPY75]], implicit $scc
-; GCN-ISEL-NEXT:   [[COPY77:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U6]].sub1
-; GCN-ISEL-NEXT:   [[COPY78:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U7]].sub1
-; GCN-ISEL-NEXT:   [[S_CSELECT_B32_7:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[COPY78]], killed [[COPY77]], implicit $scc
-; GCN-ISEL-NEXT:   [[S_SUB_C4:%[0-9]+]]:sreg_32, [[S_SUB_C5:%[0-9]+]]:sreg_64_xexec = S_SUB_CO_PSEUDO [[COPY63]], [[S_ADD_I32_8]], [[S_USUBO1]], implicit-def dead $scc
-; GCN-ISEL-NEXT:   S_CMP_GE_U32 [[S_SUB_C4]], [[COPY24]], implicit-def $scc
-; GCN-ISEL-NEXT:   [[S_CSELECT_B32_8:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
-; GCN-ISEL-NEXT:   S_CMP_GE_U32 [[S_USUBO]], [[COPY23]], implicit-def $scc
-; GCN-ISEL-NEXT:   [[S_CSELECT_B32_9:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
-; GCN-ISEL-NEXT:   S_CMP_EQ_U32 [[S_SUB_C4]], [[COPY24]], implicit-def $scc
-; GCN-ISEL-NEXT:   [[S_CSELECT_B32_10:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_9]], killed [[S_CSELECT_B32_8]], implicit $scc
-; GCN-ISEL-NEXT:   [[COPY79:%[0-9]+]]:sreg_32 = COPY killed [[S_CSELECT_B32_10]]
-; GCN-ISEL-NEXT:   S_CMP_LG_U32 killed [[COPY79]], [[S_MOV_B32_10]], implicit-def $scc
-; GCN-ISEL-NEXT:   [[S_CSELECT_B32_11:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_7]], [[COPY70]], implicit $scc
-; GCN-ISEL-NEXT:   [[S_CSELECT_B32_12:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_6]], [[COPY71]], implicit $scc
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE24:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_CSELECT_B32_12]], %subreg.sub0, killed [[S_CSELECT_B32_11]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[S_MOV_B64_5:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-; GCN-ISEL-NEXT:   [[COPY80:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE24]]
-; GCN-ISEL-NEXT:   S_BRANCH %bb.1
-; GCN-ISEL-NEXT: {{  $}}
-; GCN-ISEL-NEXT: bb.4 (%ir-block.14):
-; GCN-ISEL-NEXT:   [[PHI2:%[0-9]+]]:sreg_64 = PHI [[PHI]], %bb.1, [[COPY22]], %bb.2
-; GCN-ISEL-NEXT:   [[COPY81:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1
-; GCN-ISEL-NEXT:   [[COPY82:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE25:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY82]], %subreg.sub0, killed [[COPY81]], %subreg.sub1
-; GCN-ISEL-NEXT:   [[COPY83:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE25]].sub1
-; GCN-ISEL-NEXT:   [[COPY84:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE25]].sub0
-; GCN-ISEL-NEXT:   [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT:   [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT:   [[REG_SEQUENCE26:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY84]], %subreg.sub0, killed [[COPY83]], %subreg.sub1, killed [[S_MOV_B32_13]], %subreg.sub2, killed [[S_MOV_B32_12]], %subreg.sub3
-; GCN-ISEL-NEXT:   [[COPY85:%[0-9]+]]:vreg_64 = COPY [[PHI2]]
-; GCN-ISEL-NEXT:   BUFFER_STORE_DWORDX2_OFFSET [[COPY85]], killed [[REG_SEQUENCE26]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.16, addrspace 1)
-; GCN-ISEL-NEXT:   S_ENDPGM 0
   %result = udiv i64 %x, %y
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -3932,3 +3225,5 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN-ISEL: {{.*}}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.s b/llvm/test/CodeGen/AMDGPU/carryout-selection.s
new file mode 100644
index 0000000000000..db52017f9091f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.s
@@ -0,0 +1,3547 @@
+--- |
+  ; ModuleID = '../llvm/test/CodeGen/AMDGPU/carryout-selection.ll'
+  source_filename = "../llvm/test/CodeGen/AMDGPU/carryout-selection.ll"
+  target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+  target triple = "amdgcn"
+  
+  define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
+  entry:
+    %sadd64rr.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %sadd64rr.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %0 = load <3 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %out.load1 = extractelement <3 x i64> %0, i32 0
+    %1 = inttoptr i64 %out.load1 to ptr addrspace(1)
+    %a.load2 = extractelement <3 x i64> %0, i32 1
+    %b.load3 = extractelement <3 x i64> %0, i32 2
+    %add = add i64 %a.load2, %b.load3
+    store i64 %add, ptr addrspace(1) %1, align 8
+    ret void
+  }
+  
+  define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
+  entry:
+    %sadd64ri.kernarg.segment = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %sadd64ri.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %0 = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %out.load1 = extractelement <2 x i64> %0, i32 0
+    %1 = inttoptr i64 %out.load1 to ptr addrspace(1)
+    %a.load2 = extractelement <2 x i64> %0, i32 1
+    %add = add i64 20015998343286, %a.load2
+    store i64 %add, ptr addrspace(1) %1, align 8
+    ret void
+  }
+  
+  define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
+  entry:
+    %vadd64rr.kernarg.segment = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %vadd64rr.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %0 = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %out.load1 = extractelement <2 x i64> %0, i32 0
+    %1 = inttoptr i64 %out.load1 to ptr addrspace(1)
+    %a.load2 = extractelement <2 x i64> %0, i32 1
+    %tid = call i32 @llvm.amdgcn.workitem.id.x()
+    %tid.ext = sext i32 %tid to i64
+    %add = add i64 %a.load2, %tid.ext
+    store i64 %add, ptr addrspace(1) %1, align 8
+    ret void
+  }
+  
+  define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
+  entry:
+    %vadd64ri.kernarg.segment = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %vadd64ri.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %out.load = load ptr addrspace(1), ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %tid = call i32 @llvm.amdgcn.workitem.id.x()
+    %tid.ext = sext i32 %tid to i64
+    %add = add i64 20015998343286, %tid.ext
+    store i64 %add, ptr addrspace(1) %out.load, align 8
+    ret void
+  }
+  
+  ; Function Attrs: nounwind
+  define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #1 {
+    %suaddo32.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %suaddo32.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %out.load = load ptr addrspace(1), ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %a.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %suaddo32.kernarg.segment, i64 52, !amdgpu.uniform !0
+    %1 = load <2 x i32>, ptr addrspace(4) %a.kernarg.offset, align 4, !invariant.load !0
+    %a.load1 = extractelement <2 x i32> %1, i32 0
+    %b.load2 = extractelement <2 x i32> %1, i32 1
+    %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a.load1, i32 %b.load2)
+    %val = extractvalue { i32, i1 } %uadd, 0
+    store i32 %val, ptr addrspace(1) %out.load, align 4
+    ret void
+  }
+  
+  ; Function Attrs: nounwind
+  define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #1 {
+    %uaddo32_vcc_user.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %uaddo32_vcc_user.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %1 = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %out.load1 = extractelement <2 x i64> %1, i32 0
+    %2 = inttoptr i64 %out.load1 to ptr addrspace(1)
+    %carryout.load2 = extractelement <2 x i64> %1, i32 1
+    %3 = inttoptr i64 %carryout.load2 to ptr addrspace(1)
+    %a.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %uaddo32_vcc_user.kernarg.segment, i64 52, !amdgpu.uniform !0
+    %4 = load <2 x i32>, ptr addrspace(4) %a.kernarg.offset, align 4, !invariant.load !0
+    %a.load3 = extractelement <2 x i32> %4, i32 0
+    %b.load4 = extractelement <2 x i32> %4, i32 1
+    %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a.load3, i32 %b.load4)
+    %val = extractvalue { i32, i1 } %uadd, 0
+    %carry = extractvalue { i32, i1 } %uadd, 1
+    store i32 %val, ptr addrspace(1) %2, align 4
+    store i1 %carry, ptr addrspace(1) %3, align 1
+    ret void
+  }
+  
+  ; Function Attrs: nounwind
+  define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #1 {
+    %suaddo64.kernarg.segment = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %suaddo64.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %1 = load <4 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %out.load1 = extractelement <4 x i64> %1, i32 0
+    %2 = inttoptr i64 %out.load1 to ptr addrspace(1)
+    %carryout.load2 = extractelement <4 x i64> %1, i32 1
+    %3 = inttoptr i64 %carryout.load2 to ptr addrspace(1)
+    %a.load3 = extractelement <4 x i64> %1, i32 2
+    %b.load4 = extractelement <4 x i64> %1, i32 3
+    %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a.load3, i64 %b.load4)
+    %val = extractvalue { i64, i1 } %uadd, 0
+    %carry = extractvalue { i64, i1 } %uadd, 1
+    store i64 %val, ptr addrspace(1) %2, align 8
+    store i1 %carry, ptr addrspace(1) %3, align 1
+    ret void
+  }
+  
+  ; Function Attrs: nounwind
+  define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #1 {
+    %vuaddo64.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %vuaddo64.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %1 = load <3 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %out.load1 = extractelement <3 x i64> %1, i32 0
+    %2 = inttoptr i64 %out.load1 to ptr addrspace(1)
+    %carryout.load2 = extractelement <3 x i64> %1, i32 1
+    %3 = inttoptr i64 %carryout.load2 to ptr addrspace(1)
+    %a.load3 = extractelement <3 x i64> %1, i32 2
+    %tid = call i32 @llvm.amdgcn.workitem.id.x()
+    %tid.ext = sext i32 %tid to i64
+    %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a.load3, i64 %tid.ext)
+    %val = extractvalue { i64, i1 } %uadd, 0
+    %carry = extractvalue { i64, i1 } %uadd, 1
+    store i64 %val, ptr addrspace(1) %2, align 8
+    store i1 %carry, ptr addrspace(1) %3, align 1
+    ret void
+  }
+  
+  define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
+  entry:
+    %ssub64rr.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %ssub64rr.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %0 = load <3 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %out.load1 = extractelement <3 x i64> %0, i32 0
+    %1 = inttoptr i64 %out.load1 to ptr addrspace(1)
+    %a.load2 = extractelement <3 x i64> %0, i32 1
+    %b.load3 = extractelement <3 x i64> %0, i32 2
+    %sub = sub i64 %a.load2, %b.load3
+    store i64 %sub, ptr addrspace(1) %1, align 8
+    ret void
+  }
+  
+  define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
+  entry:
+    %ssub64ri.kernarg.segment = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %ssub64ri.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %0 = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %out.load1 = extractelement <2 x i64> %0, i32 0
+    %1 = inttoptr i64 %out.load1 to ptr addrspace(1)
+    %a.load2 = extractelement <2 x i64> %0, i32 1
+    %sub = sub i64 20015998343286, %a.load2
+    store i64 %sub, ptr addrspace(1) %1, align 8
+    ret void
+  }
+  
+  define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
+  entry:
+    %vsub64rr.kernarg.segment = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %vsub64rr.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %0 = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %out.load1 = extractelement <2 x i64> %0, i32 0
+    %1 = inttoptr i64 %out.load1 to ptr addrspace(1)
+    %a.load2 = extractelement <2 x i64> %0, i32 1
+    %tid = call i32 @llvm.amdgcn.workitem.id.x()
+    %tid.ext = sext i32 %tid to i64
+    %sub = sub i64 %a.load2, %tid.ext
+    store i64 %sub, ptr addrspace(1) %1, align 8
+    ret void
+  }
+  
+  define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
+  entry:
+    %vsub64ri.kernarg.segment = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %vsub64ri.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %out.load = load ptr addrspace(1), ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %tid = call i32 @llvm.amdgcn.workitem.id.x()
+    %tid.ext = sext i32 %tid to i64
+    %sub = sub i64 20015998343286, %tid.ext
+    store i64 %sub, ptr addrspace(1) %out.load, align 8
+    ret void
+  }
+  
+  ; Function Attrs: nounwind
+  define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #1 {
+    %susubo32.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %susubo32.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %out.load = load ptr addrspace(1), ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %a.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %susubo32.kernarg.segment, i64 52, !amdgpu.uniform !0
+    %1 = load <2 x i32>, ptr addrspace(4) %a.kernarg.offset, align 4, !invariant.load !0
+    %a.load1 = extractelement <2 x i32> %1, i32 0
+    %b.load2 = extractelement <2 x i32> %1, i32 1
+    %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a.load1, i32 %b.load2)
+    %val = extractvalue { i32, i1 } %usub, 0
+    store i32 %val, ptr addrspace(1) %out.load, align 4
+    ret void
+  }
+  
+  ; Function Attrs: nounwind
+  define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #1 {
+    %usubo32_vcc_user.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %usubo32_vcc_user.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %1 = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %out.load1 = extractelement <2 x i64> %1, i32 0
+    %2 = inttoptr i64 %out.load1 to ptr addrspace(1)
+    %carryout.load2 = extractelement <2 x i64> %1, i32 1
+    %3 = inttoptr i64 %carryout.load2 to ptr addrspace(1)
+    %a.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %usubo32_vcc_user.kernarg.segment, i64 52, !amdgpu.uniform !0
+    %4 = load <2 x i32>, ptr addrspace(4) %a.kernarg.offset, align 4, !invariant.load !0
+    %a.load3 = extractelement <2 x i32> %4, i32 0
+    %b.load4 = extractelement <2 x i32> %4, i32 1
+    %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a.load3, i32 %b.load4)
+    %val = extractvalue { i32, i1 } %usub, 0
+    %carry = extractvalue { i32, i1 } %usub, 1
+    store i32 %val, ptr addrspace(1) %2, align 4
+    store i1 %carry, ptr addrspace(1) %3, align 1
+    ret void
+  }
+  
+  ; Function Attrs: nounwind
+  define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #1 {
+    %susubo64.kernarg.segment = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %susubo64.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %1 = load <4 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %out.load1 = extractelement <4 x i64> %1, i32 0
+    %2 = inttoptr i64 %out.load1 to ptr addrspace(1)
+    %carryout.load2 = extractelement <4 x i64> %1, i32 1
+    %3 = inttoptr i64 %carryout.load2 to ptr addrspace(1)
+    %a.load3 = extractelement <4 x i64> %1, i32 2
+    %b.load4 = extractelement <4 x i64> %1, i32 3
+    %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a.load3, i64 %b.load4)
+    %val = extractvalue { i64, i1 } %usub, 0
+    %carry = extractvalue { i64, i1 } %usub, 1
+    store i64 %val, ptr addrspace(1) %2, align 8
+    store i1 %carry, ptr addrspace(1) %3, align 1
+    ret void
+  }
+  
+  ; Function Attrs: nounwind
+  define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #1 {
+    %vusubo64.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %vusubo64.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %1 = load <3 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %out.load1 = extractelement <3 x i64> %1, i32 0
+    %2 = inttoptr i64 %out.load1 to ptr addrspace(1)
+    %carryout.load2 = extractelement <3 x i64> %1, i32 1
+    %3 = inttoptr i64 %carryout.load2 to ptr addrspace(1)
+    %a.load3 = extractelement <3 x i64> %1, i32 2
+    %tid = call i32 @llvm.amdgcn.workitem.id.x()
+    %tid.ext = sext i32 %tid to i64
+    %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a.load3, i64 %tid.ext)
+    %val = extractvalue { i64, i1 } %usub, 0
+    %carry = extractvalue { i64, i1 } %usub, 1
+    store i64 %val, ptr addrspace(1) %2, align 8
+    store i1 %carry, ptr addrspace(1) %3, align 1
+    ret void
+  }
+  
+  define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
+    %sudiv64.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %sudiv64.kernarg.segment, i64 36, !amdgpu.uniform !0
+    %1 = load <3 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+    %x.load2 = extractelement <3 x i64> %1, i32 1
+    %y.load3 = extractelement <3 x i64> %1, i32 2
+    %2 = or i64 %x.load2, %y.load3
+    %3 = and i64 %2, -4294967296
+    %4 = icmp ne i64 %3, 0
+    br i1 %4, label %12, label %Flow, !amdgpu.uniform !0
+  
+  Flow:                                             ; preds = %12, %0
+    %5 = phi i64 [ %13, %12 ], [ poison, %0 ]
+    %6 = phi i1 [ false, %12 ], [ true, %0 ]
+    br i1 %6, label %7, label %14, !amdgpu.uniform !0
+  
+  7:                                                ; preds = %Flow
+    %8 = trunc i64 %y.load3 to i32
+    %9 = trunc i64 %x.load2 to i32
+    %10 = udiv i32 %9, %8
+    %11 = zext i32 %10 to i64
+    br label %14, !amdgpu.uniform !0
+  
+  12:                                               ; preds = %0
+    %13 = udiv i64 %x.load2, %y.load3
+    br label %Flow, !amdgpu.uniform !0
+  
+  14:                                               ; preds = %7, %Flow
+    %15 = phi i64 [ %5, %Flow ], [ %11, %7 ]
+    %out.load1 = extractelement <3 x i64> %1, i32 0
+    %16 = inttoptr i64 %out.load1 to ptr addrspace(1)
+    store i64 %15, ptr addrspace(1) %16, align 8
+    ret void
+  }
+  
+  ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none)
+  declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #2
+  
+  ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none)
+  declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #2
+  
+  ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none)
+  declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) #2
+  
+  ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none)
+  declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #2
+  
+  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+  declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #3
+  
+  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+  declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #3
+  
+  attributes #0 = { "amdgpu-memory-bound"="true" "amdgpu-wave-limiter"="true" }
+  attributes #1 = { nounwind }
+  attributes #2 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+  attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+  
+  !0 = !{}
+...
+---
+name:            sadd64rr
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 22, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 23, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 24, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 25, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 26, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 27, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 28, class: vreg_64, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 24
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     true
+  waveLimiter:     true
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0.entry:
+    liveins: $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+    %13:sreg_32 = COPY %12.sub1
+    %14:sreg_32 = COPY %12.sub0
+    %15:sreg_32 = COPY %11.sub3
+    %16:sreg_32 = COPY %11.sub2
+    %17:sreg_32 = COPY %11.sub1
+    %18:sreg_32 = COPY %11.sub0
+    %19:sreg_64 = REG_SEQUENCE killed %18, %subreg.sub0, killed %17, %subreg.sub1
+    %20:sreg_32 = COPY %19.sub1
+    %21:sreg_32 = COPY %19.sub0
+    %22:sreg_32 = S_MOV_B32 61440
+    %23:sreg_32 = S_MOV_B32 -1
+    %24:sgpr_128 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1, killed %23, %subreg.sub2, killed %22, %subreg.sub3
+    %25:sreg_64 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1
+    %26:sreg_64 = REG_SEQUENCE killed %14, %subreg.sub0, killed %13, %subreg.sub1
+    %27:sreg_64 = S_ADD_U64_PSEUDO killed %25, killed %26, implicit-def dead $scc
+    %28:vreg_64 = COPY %27
+    BUFFER_STORE_DWORDX2_OFFSET killed %28, killed %24, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            sadd64ri
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 22, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 23, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 24, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 25, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 26, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 27, class: vreg_64, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 16
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0.entry:
+    liveins: $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_32 = COPY %11.sub1
+    %13:sreg_32 = COPY %11.sub0
+    %14:sreg_64 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1
+    %15:sreg_32 = COPY %14.sub1
+    %16:sreg_32 = COPY %14.sub0
+    %17:sreg_32 = S_MOV_B32 61440
+    %18:sreg_32 = S_MOV_B32 -1
+    %19:sgpr_128 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1, killed %18, %subreg.sub2, killed %17, %subreg.sub3
+    %20:sreg_32 = COPY %11.sub3
+    %21:sreg_32 = COPY %11.sub2
+    %22:sreg_64 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1
+    %23:sreg_32 = S_MOV_B32 4660
+    %24:sreg_32 = S_MOV_B32 1450743926
+    %25:sreg_64 = REG_SEQUENCE killed %24, %subreg.sub0, killed %23, %subreg.sub1
+    %26:sreg_64 = S_ADD_U64_PSEUDO killed %22, killed %25, implicit-def dead $scc
+    %27:vreg_64 = COPY %26
+    BUFFER_STORE_DWORDX2_OFFSET killed %27, killed %19, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            vadd64rr
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 22, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 23, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 24, class: vreg_64, preferred-register: '', flags: [  ] }
+  - { id: 25, class: vreg_64, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 16
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0.entry:
+    liveins: $vgpr0, $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %0:vgpr_32(s32) = COPY $vgpr0
+    %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_32 = COPY %11.sub1
+    %13:sreg_32 = COPY %11.sub0
+    %14:sreg_64 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1
+    %15:sreg_32 = COPY %14.sub1
+    %16:sreg_32 = COPY %14.sub0
+    %17:sreg_32 = S_MOV_B32 61440
+    %18:sreg_32 = S_MOV_B32 -1
+    %19:sgpr_128 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1, killed %18, %subreg.sub2, killed %17, %subreg.sub3
+    %20:sreg_32 = COPY %11.sub3
+    %21:sreg_32 = COPY %11.sub2
+    %22:sreg_64 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1
+    %23:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %24:vreg_64 = REG_SEQUENCE %0(s32), %subreg.sub0, killed %23, %subreg.sub1
+    %25:vreg_64 = V_ADD_U64_PSEUDO killed %22, killed %24, implicit-def dead $vcc, implicit $exec
+    BUFFER_STORE_DWORDX2_OFFSET killed %25, killed %19, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            vadd64ri
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 17, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: vreg_64, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 22, class: vreg_64, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 8
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0.entry:
+    liveins: $vgpr0, $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %0:vgpr_32(s32) = COPY $vgpr0
+    %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_32 = COPY %11.sub1
+    %13:sreg_32 = COPY %11.sub0
+    %14:sreg_32 = S_MOV_B32 61440
+    %15:sreg_32 = S_MOV_B32 -1
+    %16:sgpr_128 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1, killed %15, %subreg.sub2, killed %14, %subreg.sub3
+    %17:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %18:vreg_64 = REG_SEQUENCE %0(s32), %subreg.sub0, killed %17, %subreg.sub1
+    %19:sreg_32 = S_MOV_B32 4660
+    %20:sreg_32 = S_MOV_B32 1450743926
+    %21:sreg_64 = REG_SEQUENCE killed %20, %subreg.sub0, killed %19, %subreg.sub1
+    %22:vreg_64 = V_ADD_U64_PSEUDO killed %18, killed %21, implicit-def dead $vcc, implicit $exec
+    BUFFER_STORE_DWORDX2_OFFSET killed %22, killed %16, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            suaddo32
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 21, class: vgpr_32, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 24
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+    %13:sreg_32 = COPY %11.sub1
+    %14:sreg_32 = COPY %11.sub0
+    %15:sreg_32 = S_MOV_B32 61440
+    %16:sreg_32 = S_MOV_B32 -1
+    %17:sgpr_128 = REG_SEQUENCE killed %14, %subreg.sub0, killed %13, %subreg.sub1, killed %16, %subreg.sub2, killed %15, %subreg.sub3
+    %18:sreg_32 = COPY %12.sub0
+    %19:sreg_32 = COPY %12.sub1
+    %20:sreg_32 = S_ADD_I32 killed %18, killed %19, implicit-def dead $scc
+    %21:vgpr_32 = COPY %20
+    BUFFER_STORE_DWORD_OFFSET killed %21, killed %17, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            uaddo32_vcc_user
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 22, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 23, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 24, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 25, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 26, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 27, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 28, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 29, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 30, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 31, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 32, class: vgpr_32, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 24
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+    %13:sreg_32 = COPY %11.sub1
+    %14:sreg_32 = COPY %11.sub0
+    %15:sreg_64 = REG_SEQUENCE killed %14, %subreg.sub0, killed %13, %subreg.sub1
+    %16:sreg_32 = COPY %15.sub1
+    %17:sreg_32 = COPY %15.sub0
+    %18:sreg_32 = S_MOV_B32 61440
+    %19:sreg_32 = S_MOV_B32 -1
+    %20:sgpr_128 = REG_SEQUENCE killed %17, %subreg.sub0, killed %16, %subreg.sub1, %19, %subreg.sub2, %18, %subreg.sub3
+    %21:sreg_32 = COPY %11.sub3
+    %22:sreg_32 = COPY %11.sub2
+    %23:sreg_64 = REG_SEQUENCE killed %22, %subreg.sub0, killed %21, %subreg.sub1
+    %24:sreg_32 = COPY %23.sub1
+    %25:sreg_32 = COPY %23.sub0
+    %26:sgpr_128 = REG_SEQUENCE killed %25, %subreg.sub0, killed %24, %subreg.sub1, %19, %subreg.sub2, %18, %subreg.sub3
+    %27:sreg_32 = COPY %12.sub0
+    %28:sreg_32 = COPY %12.sub1
+    %31:vgpr_32 = COPY killed %28
+    %29:vgpr_32, %30:sreg_64_xexec = V_ADD_CO_U32_e64 killed %27, %31, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %29, killed %20, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
+    %32:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %30, implicit $exec
+    BUFFER_STORE_BYTE_OFFSET killed %32, killed %26, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            suaddo64
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sgpr_256, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 22, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 23, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 24, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 25, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 26, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 27, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 28, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 29, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 30, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 31, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 32, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 33, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 34, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 35, class: vreg_64, preferred-register: '', flags: [  ] }
+  - { id: 36, class: vgpr_32, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 32
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %11:sgpr_256 = S_LOAD_DWORDX8_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s256) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_32 = COPY %11.sub1
+    %13:sreg_32 = COPY %11.sub0
+    %14:sreg_64 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1
+    %15:sreg_32 = COPY %14.sub1
+    %16:sreg_32 = COPY %14.sub0
+    %17:sreg_32 = S_MOV_B32 61440
+    %18:sreg_32 = S_MOV_B32 -1
+    %19:sgpr_128 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1, %18, %subreg.sub2, %17, %subreg.sub3
+    %20:sreg_32 = COPY %11.sub3
+    %21:sreg_32 = COPY %11.sub2
+    %22:sreg_64 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1
+    %23:sreg_32 = COPY %22.sub1
+    %24:sreg_32 = COPY %22.sub0
+    %25:sgpr_128 = REG_SEQUENCE killed %24, %subreg.sub0, killed %23, %subreg.sub1, %18, %subreg.sub2, %17, %subreg.sub3
+    %26:sreg_32 = COPY %11.sub5
+    %27:sreg_32 = COPY %11.sub4
+    %28:sreg_32 = COPY %11.sub7
+    %29:sreg_32 = COPY %11.sub6
+    %30:sreg_32, %31:sreg_64_xexec = S_UADDO_PSEUDO killed %27, killed %29, implicit-def dead $scc
+    %32:sreg_32, %33:sreg_64_xexec = S_ADD_CO_PSEUDO killed %26, killed %28, killed %31, implicit-def dead $scc
+    %34:sreg_64 = REG_SEQUENCE killed %30, %subreg.sub0, killed %32, %subreg.sub1
+    %35:vreg_64 = COPY %34
+    BUFFER_STORE_DWORDX2_OFFSET killed %35, killed %19, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+    %36:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %33, implicit $exec
+    BUFFER_STORE_BYTE_OFFSET killed %36, killed %25, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            vuaddo64
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 22, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 23, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 24, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 25, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 26, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 27, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 28, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 29, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 30, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 31, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 32, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 33, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 34, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 35, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 36, class: vreg_64, preferred-register: '', flags: [  ] }
+  - { id: 37, class: vgpr_32, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 24
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $vgpr0, $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %0:vgpr_32(s32) = COPY $vgpr0
+    %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+    %13:sreg_32 = COPY %12.sub1
+    %14:sreg_32 = COPY %12.sub0
+    %15:sreg_32 = COPY %11.sub3
+    %16:sreg_32 = COPY %11.sub2
+    %17:sreg_32 = COPY %11.sub1
+    %18:sreg_32 = COPY %11.sub0
+    %19:sreg_64 = REG_SEQUENCE killed %18, %subreg.sub0, killed %17, %subreg.sub1
+    %20:sreg_32 = COPY %19.sub1
+    %21:sreg_32 = COPY %19.sub0
+    %22:sreg_32 = S_MOV_B32 61440
+    %23:sreg_32 = S_MOV_B32 -1
+    %24:sgpr_128 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1, %23, %subreg.sub2, %22, %subreg.sub3
+    %25:sreg_64 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1
+    %26:sreg_32 = COPY %25.sub1
+    %27:sreg_32 = COPY %25.sub0
+    %28:sgpr_128 = REG_SEQUENCE killed %27, %subreg.sub0, killed %26, %subreg.sub1, %23, %subreg.sub2, %22, %subreg.sub3
+    %29:vgpr_32, %30:sreg_64_xexec = V_ADD_CO_U32_e64 killed %14, %0(s32), 0, implicit $exec
+    %31:sreg_32 = S_MOV_B32 0
+    %34:vgpr_32 = COPY killed %13
+    %35:vgpr_32 = COPY killed %31
+    %32:vgpr_32, %33:sreg_64_xexec = V_ADDC_U32_e64 %34, %35, killed %30, 0, implicit $exec
+    %36:vreg_64 = REG_SEQUENCE killed %29, %subreg.sub0, killed %32, %subreg.sub1
+    BUFFER_STORE_DWORDX2_OFFSET killed %36, killed %24, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+    %37:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %33, implicit $exec
+    BUFFER_STORE_BYTE_OFFSET killed %37, killed %28, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            ssub64rr
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 22, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 23, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 24, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 25, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 26, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 27, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 28, class: vreg_64, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 24
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     true
+  waveLimiter:     true
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0.entry:
+    liveins: $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+    %13:sreg_32 = COPY %12.sub1
+    %14:sreg_32 = COPY %12.sub0
+    %15:sreg_32 = COPY %11.sub3
+    %16:sreg_32 = COPY %11.sub2
+    %17:sreg_32 = COPY %11.sub1
+    %18:sreg_32 = COPY %11.sub0
+    %19:sreg_64 = REG_SEQUENCE killed %18, %subreg.sub0, killed %17, %subreg.sub1
+    %20:sreg_32 = COPY %19.sub1
+    %21:sreg_32 = COPY %19.sub0
+    %22:sreg_32 = S_MOV_B32 61440
+    %23:sreg_32 = S_MOV_B32 -1
+    %24:sgpr_128 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1, killed %23, %subreg.sub2, killed %22, %subreg.sub3
+    %25:sreg_64 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1
+    %26:sreg_64 = REG_SEQUENCE killed %14, %subreg.sub0, killed %13, %subreg.sub1
+    %27:sreg_64 = S_SUB_U64_PSEUDO killed %25, killed %26, implicit-def dead $scc
+    %28:vreg_64 = COPY %27
+    BUFFER_STORE_DWORDX2_OFFSET killed %28, killed %24, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            ssub64ri
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 22, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 23, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 24, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 25, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 26, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 27, class: vreg_64, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 16
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0.entry:
+    liveins: $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_32 = COPY %11.sub1
+    %13:sreg_32 = COPY %11.sub0
+    %14:sreg_64 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1
+    %15:sreg_32 = COPY %14.sub1
+    %16:sreg_32 = COPY %14.sub0
+    %17:sreg_32 = S_MOV_B32 61440
+    %18:sreg_32 = S_MOV_B32 -1
+    %19:sgpr_128 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1, killed %18, %subreg.sub2, killed %17, %subreg.sub3
+    %20:sreg_32 = COPY %11.sub3
+    %21:sreg_32 = COPY %11.sub2
+    %22:sreg_64 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1
+    %23:sreg_32 = S_MOV_B32 4660
+    %24:sreg_32 = S_MOV_B32 1450743926
+    %25:sreg_64 = REG_SEQUENCE killed %24, %subreg.sub0, killed %23, %subreg.sub1
+    %26:sreg_64 = S_SUB_U64_PSEUDO killed %25, killed %22, implicit-def dead $scc
+    %27:vreg_64 = COPY %26
+    BUFFER_STORE_DWORDX2_OFFSET killed %27, killed %19, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            vsub64rr
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 22, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 23, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 24, class: vreg_64, preferred-register: '', flags: [  ] }
+  - { id: 25, class: vreg_64, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 16
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0.entry:
+    liveins: $vgpr0, $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %0:vgpr_32(s32) = COPY $vgpr0
+    %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_32 = COPY %11.sub1
+    %13:sreg_32 = COPY %11.sub0
+    %14:sreg_64 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1
+    %15:sreg_32 = COPY %14.sub1
+    %16:sreg_32 = COPY %14.sub0
+    %17:sreg_32 = S_MOV_B32 61440
+    %18:sreg_32 = S_MOV_B32 -1
+    %19:sgpr_128 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1, killed %18, %subreg.sub2, killed %17, %subreg.sub3
+    %20:sreg_32 = COPY %11.sub3
+    %21:sreg_32 = COPY %11.sub2
+    %22:sreg_64 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1
+    %23:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %24:vreg_64 = REG_SEQUENCE %0(s32), %subreg.sub0, killed %23, %subreg.sub1
+    %25:vreg_64 = V_SUB_U64_PSEUDO killed %22, killed %24, implicit-def dead $vcc, implicit $exec
+    BUFFER_STORE_DWORDX2_OFFSET killed %25, killed %19, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            vsub64ri
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 17, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: vreg_64, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 22, class: vreg_64, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 8
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0.entry:
+    liveins: $vgpr0, $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %0:vgpr_32(s32) = COPY $vgpr0
+    %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_32 = COPY %11.sub1
+    %13:sreg_32 = COPY %11.sub0
+    %14:sreg_32 = S_MOV_B32 61440
+    %15:sreg_32 = S_MOV_B32 -1
+    %16:sgpr_128 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1, killed %15, %subreg.sub2, killed %14, %subreg.sub3
+    %17:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %18:vreg_64 = REG_SEQUENCE %0(s32), %subreg.sub0, killed %17, %subreg.sub1
+    %19:sreg_32 = S_MOV_B32 4660
+    %20:sreg_32 = S_MOV_B32 1450743926
+    %21:sreg_64 = REG_SEQUENCE killed %20, %subreg.sub0, killed %19, %subreg.sub1
+    %22:vreg_64 = V_SUB_U64_PSEUDO killed %21, killed %18, implicit-def dead $vcc, implicit $exec
+    BUFFER_STORE_DWORDX2_OFFSET killed %22, killed %16, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            susubo32
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 21, class: vgpr_32, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 24
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+    %13:sreg_32 = COPY %11.sub1
+    %14:sreg_32 = COPY %11.sub0
+    %15:sreg_32 = S_MOV_B32 61440
+    %16:sreg_32 = S_MOV_B32 -1
+    %17:sgpr_128 = REG_SEQUENCE killed %14, %subreg.sub0, killed %13, %subreg.sub1, killed %16, %subreg.sub2, killed %15, %subreg.sub3
+    %18:sreg_32 = COPY %12.sub0
+    %19:sreg_32 = COPY %12.sub1
+    %20:sreg_32 = S_SUB_I32 killed %18, killed %19, implicit-def dead $scc
+    %21:vgpr_32 = COPY %20
+    BUFFER_STORE_DWORD_OFFSET killed %21, killed %17, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            usubo32_vcc_user
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 22, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 23, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 24, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 25, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 26, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 27, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 28, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 29, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 30, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 31, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 32, class: vgpr_32, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 24
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+    %13:sreg_32 = COPY %11.sub1
+    %14:sreg_32 = COPY %11.sub0
+    %15:sreg_64 = REG_SEQUENCE killed %14, %subreg.sub0, killed %13, %subreg.sub1
+    %16:sreg_32 = COPY %15.sub1
+    %17:sreg_32 = COPY %15.sub0
+    %18:sreg_32 = S_MOV_B32 61440
+    %19:sreg_32 = S_MOV_B32 -1
+    %20:sgpr_128 = REG_SEQUENCE killed %17, %subreg.sub0, killed %16, %subreg.sub1, %19, %subreg.sub2, %18, %subreg.sub3
+    %21:sreg_32 = COPY %11.sub3
+    %22:sreg_32 = COPY %11.sub2
+    %23:sreg_64 = REG_SEQUENCE killed %22, %subreg.sub0, killed %21, %subreg.sub1
+    %24:sreg_32 = COPY %23.sub1
+    %25:sreg_32 = COPY %23.sub0
+    %26:sgpr_128 = REG_SEQUENCE killed %25, %subreg.sub0, killed %24, %subreg.sub1, %19, %subreg.sub2, %18, %subreg.sub3
+    %27:sreg_32 = COPY %12.sub0
+    %28:sreg_32 = COPY %12.sub1
+    %31:vgpr_32 = COPY killed %28
+    %29:vgpr_32, %30:sreg_64_xexec = V_SUB_CO_U32_e64 killed %27, %31, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %29, killed %20, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
+    %32:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %30, implicit $exec
+    BUFFER_STORE_BYTE_OFFSET killed %32, killed %26, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            susubo64
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sgpr_256, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 22, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 23, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 24, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 25, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 26, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 27, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 28, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 29, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 30, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 31, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 32, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 33, class: vreg_64, preferred-register: '', flags: [  ] }
+  - { id: 34, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 35, class: vreg_64, preferred-register: '', flags: [  ] }
+  - { id: 36, class: vgpr_32, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 32
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %11:sgpr_256 = S_LOAD_DWORDX8_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s256) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_32 = COPY %11.sub1
+    %13:sreg_32 = COPY %11.sub0
+    %14:sreg_64 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1
+    %15:sreg_32 = COPY %14.sub1
+    %16:sreg_32 = COPY %14.sub0
+    %17:sreg_32 = S_MOV_B32 61440
+    %18:sreg_32 = S_MOV_B32 -1
+    %19:sgpr_128 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1, %18, %subreg.sub2, %17, %subreg.sub3
+    %20:sreg_32 = COPY %11.sub3
+    %21:sreg_32 = COPY %11.sub2
+    %22:sreg_64 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1
+    %23:sreg_32 = COPY %22.sub1
+    %24:sreg_32 = COPY %22.sub0
+    %25:sgpr_128 = REG_SEQUENCE killed %24, %subreg.sub0, killed %23, %subreg.sub1, %18, %subreg.sub2, %17, %subreg.sub3
+    %26:sreg_32 = COPY %11.sub5
+    %27:sreg_32 = COPY %11.sub4
+    %28:sreg_64 = REG_SEQUENCE killed %27, %subreg.sub0, killed %26, %subreg.sub1
+    %29:sreg_32 = COPY %11.sub7
+    %30:sreg_32 = COPY %11.sub6
+    %31:sreg_64 = REG_SEQUENCE killed %30, %subreg.sub0, killed %29, %subreg.sub1
+    %33:vreg_64 = COPY %31
+    %32:sreg_64_xexec = V_CMP_GT_U64_e64 %28, %33, implicit $exec
+    %34:sreg_64 = S_SUB_U64_PSEUDO %28, %31, implicit-def dead $scc
+    %35:vreg_64 = COPY %34
+    BUFFER_STORE_DWORDX2_OFFSET killed %35, killed %19, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+    %36:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %32, implicit $exec
+    BUFFER_STORE_BYTE_OFFSET killed %36, killed %25, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            vusubo64
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 2, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 8, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 22, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 23, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 24, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 25, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 26, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 27, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 28, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 29, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 30, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 31, class: vreg_64, preferred-register: '', flags: [  ] }
+  - { id: 32, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 33, class: vreg_64, preferred-register: '', flags: [  ] }
+  - { id: 34, class: vgpr_32, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 24
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $vgpr0, $sgpr4_sgpr5
+  
+    %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %0:vgpr_32(s32) = COPY $vgpr0
+    %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+    %13:sreg_32 = COPY %12.sub1
+    %14:sreg_32 = COPY %12.sub0
+    %15:sreg_32 = COPY %11.sub3
+    %16:sreg_32 = COPY %11.sub2
+    %17:sreg_32 = COPY %11.sub1
+    %18:sreg_32 = COPY %11.sub0
+    %19:sreg_64 = REG_SEQUENCE killed %18, %subreg.sub0, killed %17, %subreg.sub1
+    %20:sreg_32 = COPY %19.sub1
+    %21:sreg_32 = COPY %19.sub0
+    %22:sreg_32 = S_MOV_B32 61440
+    %23:sreg_32 = S_MOV_B32 -1
+    %24:sgpr_128 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1, %23, %subreg.sub2, %22, %subreg.sub3
+    %25:sreg_64 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1
+    %26:sreg_32 = COPY %25.sub1
+    %27:sreg_32 = COPY %25.sub0
+    %28:sgpr_128 = REG_SEQUENCE killed %27, %subreg.sub0, killed %26, %subreg.sub1, %23, %subreg.sub2, %22, %subreg.sub3
+    %29:sreg_64 = REG_SEQUENCE killed %14, %subreg.sub0, killed %13, %subreg.sub1
+    %30:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %31:vreg_64 = REG_SEQUENCE %0(s32), %subreg.sub0, killed %30, %subreg.sub1
+    %32:sreg_64_xexec = V_CMP_GT_U64_e64 %29, %31, implicit $exec
+    %33:vreg_64 = V_SUB_U64_PSEUDO %29, %31, implicit-def dead $vcc, implicit $exec
+    BUFFER_STORE_DWORDX2_OFFSET killed %33, killed %24, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+    %34:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %32, implicit $exec
+    BUFFER_STORE_BYTE_OFFSET killed %34, killed %28, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+    S_ENDPGM 0
+...
+---
+name:            sudiv64
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: sgpr_192, preferred-register: '', flags: [  ] }
+  - { id: 1, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 2, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 3, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 5, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 6, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 7, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 8, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 9, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 10, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 12, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 13, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 14, class: sgpr_64, preferred-register: '', flags: [  ] }
+  - { id: 15, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 16, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 17, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 18, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 19, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 20, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 21, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 22, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 23, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 24, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 25, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 26, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 27, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 28, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 29, class: sgpr_192, preferred-register: '', flags: [  ] }
+  - { id: 30, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 31, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 32, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 33, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 34, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 35, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 36, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 37, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 38, class: vreg_64, preferred-register: '', flags: [  ] }
+  - { id: 39, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 40, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 41, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 42, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 43, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 44, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 45, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 46, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 47, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 48, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 49, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 50, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 51, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 52, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 53, class: sgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 54, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 55, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 56, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 57, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 58, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 59, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 60, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 61, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 62, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 63, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 64, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 65, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 66, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 67, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 68, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 69, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 70, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 71, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 72, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 73, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 74, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 75, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 76, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 77, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 78, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 79, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 80, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 81, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 82, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 83, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 84, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 85, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 86, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 87, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 88, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 89, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 90, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 91, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 92, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 93, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 94, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 95, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 96, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 97, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 98, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 99, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 100, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 101, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 102, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 103, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 104, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 105, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 106, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 107, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 108, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 109, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 110, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 111, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 112, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 113, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 114, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 115, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 116, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 117, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 118, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 119, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 120, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 121, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 122, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 123, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 124, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 125, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 126, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 127, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 128, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 129, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 130, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 131, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 132, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 133, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 134, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 135, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 136, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 137, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 138, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 139, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 140, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 141, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 142, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 143, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 144, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 145, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 146, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 147, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 148, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 149, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 150, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 151, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 152, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 153, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 154, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 155, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 156, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 157, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 158, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 159, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 160, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 161, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 162, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 163, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 164, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 165, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 166, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 167, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 168, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 169, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 170, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 171, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 172, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 173, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 174, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 175, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 176, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 177, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 178, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 179, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 180, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 181, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 182, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 183, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 184, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 185, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 186, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 187, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 188, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 189, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 190, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 191, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 192, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 193, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 194, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 195, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 196, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 197, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 198, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 199, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 200, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 201, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 202, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 203, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 204, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 205, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 206, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 207, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 208, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 209, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 210, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 211, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 212, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 213, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 214, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 215, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 216, class: sreg_64_xexec, preferred-register: '', flags: [  ] }
+  - { id: 217, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 218, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 219, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 220, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 221, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 222, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 223, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 224, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 225, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 226, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 227, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 228, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 229, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 230, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 231, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 232, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 233, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 234, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 235, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 236, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 237, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 238, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 239, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 240, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 241, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 242, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 243, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 244, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 245, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 246, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 247, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 248, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 249, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 250, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 251, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 252, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 253, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 254, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 255, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 256, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 257, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 258, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 259, class: sreg_64, preferred-register: '', flags: [  ] }
+  - { id: 260, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 261, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 262, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 263, class: sreg_32, preferred-register: '', flags: [  ] }
+  - { id: 264, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 265, class: vreg_64, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%13' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 24
+  maxKernArgAlign: 8
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  numWaveDispatchSGPRs: 0
+  numWaveDispatchVGPRs: 0
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sp_reg'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    dispatchPtr:     { reg: '$sgpr0_sgpr1' }
+    queuePtr:        { reg: '$sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    dispatchID:      { reg: '$sgpr6_sgpr7' }
+    workGroupIDX:    { reg: '$sgpr8' }
+    workGroupIDY:    { reg: '$sgpr9' }
+    workGroupIDZ:    { reg: '$sgpr10' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+    workItemIDX:     { reg: '$vgpr0' }
+    workItemIDY:     { reg: '$vgpr1' }
+    workItemIDZ:     { reg: '$vgpr2' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       10
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: ''
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+  numKernargPreloadSGPRs: 0
+  isWholeWaveFunction: false
+body:             |
+  bb.0 (%ir-block.0):
+    successors: %bb.3(0x50000000), %bb.1(0x30000000)
+    liveins: $sgpr4_sgpr5
+  
+    %13:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %21:sgpr_128 = S_LOAD_DWORDX4_IMM %13(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+    %22:sreg_64_xexec = S_LOAD_DWORDX2_IMM %13(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+    %23:sreg_32 = COPY %22.sub1
+    %24:sreg_32 = COPY %22.sub0
+    %25:sreg_32 = COPY %21.sub3
+    %26:sreg_32 = COPY %21.sub2
+    %27:sreg_32 = COPY %21.sub1
+    %28:sreg_32 = COPY %21.sub0
+    %29:sgpr_192 = REG_SEQUENCE killed %28, %subreg.sub0, killed %27, %subreg.sub1, %26, %subreg.sub2, %25, %subreg.sub3, %24, %subreg.sub4, %23, %subreg.sub5
+    %0:sgpr_192 = COPY %29
+    %30:sreg_64 = REG_SEQUENCE %26, %subreg.sub0, %25, %subreg.sub1
+    %1:sreg_64 = COPY %30
+    %31:sreg_64 = REG_SEQUENCE %24, %subreg.sub0, %23, %subreg.sub1
+    %2:sreg_64 = COPY %31
+    %32:sreg_64 = S_OR_B64 %30, %31, implicit-def dead $scc
+    %33:sreg_32 = COPY %32.sub1
+    %34:sreg_32 = S_MOV_B32 0
+    %35:sreg_64 = REG_SEQUENCE killed %34, %subreg.sub0, killed %33, %subreg.sub1
+    %36:sreg_64 = S_MOV_B64 0
+    %38:vreg_64 = COPY killed %36
+    %37:sreg_64 = V_CMP_NE_U64_e64 killed %35, %38, implicit $exec
+    %20:sreg_64 = S_MOV_B64 -1
+    %19:sreg_64 = IMPLICIT_DEF
+    %39:sreg_64 = S_AND_B64 $exec, killed %37, implicit-def dead $scc
+    $vcc = COPY %39
+    S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+    S_BRANCH %bb.1
+  
+  bb.1.Flow:
+    successors: %bb.2(0x40000000), %bb.4(0x40000000)
+  
+    %3:sreg_64 = PHI %19, %bb.0, %6, %bb.3
+    %4:sreg_64_xexec = PHI %20, %bb.0, %40, %bb.3
+    %224:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %4, implicit $exec
+    %225:sreg_32 = S_MOV_B32 1
+    %226:sreg_32 = COPY %224
+    S_CMP_LG_U32 killed %226, killed %225, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.4, implicit $scc
+    S_BRANCH %bb.2
+  
+  bb.2 (%ir-block.7):
+    successors: %bb.4(0x80000000)
+  
+    %227:sreg_32 = COPY %2.sub0
+    %228:sreg_32 = COPY %1.sub0
+    %229:sreg_32 = S_MOV_B32 0
+    %230:sreg_32 = S_SUB_I32 killed %229, %227, implicit-def dead $scc
+    %231:vgpr_32 = V_CVT_F32_U32_e32 %227, implicit $mode, implicit $exec
+    %232:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e32 killed %231, implicit $mode, implicit $exec
+    %233:vgpr_32 = nofpexcept V_MUL_F32_e32 1333788670, killed %232, implicit $mode, implicit $exec
+    %234:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 killed %233, implicit $mode, implicit $exec
+    %236:sreg_32 = COPY %234
+    %235:sreg_32 = S_MUL_I32 killed %230, %236
+    %237:vgpr_32 = V_MUL_HI_U32_e64 %234, killed %235, implicit $exec
+    %239:sreg_32 = COPY %234
+    %240:sreg_32 = COPY %237
+    %238:sreg_32 = S_ADD_I32 %239, killed %240, implicit-def dead $scc
+    %242:vgpr_32 = COPY killed %238
+    %241:vgpr_32 = V_MUL_HI_U32_e64 %228, %242, implicit $exec
+    %243:sreg_32 = S_MOV_B32 1
+    %245:sreg_32 = COPY %241
+    %244:sreg_32 = S_ADD_I32 %245, %243, implicit-def dead $scc
+    %247:sreg_32 = COPY %241
+    %246:sreg_32 = S_MUL_I32 %247, %227
+    %248:sreg_32 = S_SUB_I32 %228, killed %246, implicit-def dead $scc
+    %249:sreg_32 = S_SUB_I32 %248, %227, implicit-def dead $scc
+    S_CMP_GE_U32 %248, %227, implicit-def $scc
+    %250:sreg_32 = S_CSELECT_B32 killed %249, %248, implicit $scc
+    %252:sreg_32 = COPY %241
+    %251:sreg_32 = S_CSELECT_B32 killed %244, %252, implicit $scc
+    %253:sreg_32 = S_ADD_I32 %251, %243, implicit-def dead $scc
+    S_CMP_GE_U32 killed %250, %227, implicit-def $scc
+    %254:sreg_32 = S_CSELECT_B32 killed %253, %251, implicit $scc
+    %255:sreg_32 = S_MOV_B32 0
+    %256:sreg_64 = REG_SEQUENCE killed %254, %subreg.sub0, killed %255, %subreg.sub1
+    %5:sreg_64 = COPY %256
+    S_BRANCH %bb.4
+  
+  bb.3 (%ir-block.12):
+    successors: %bb.1(0x80000000)
+  
+    %41:sreg_32 = COPY %2.sub0
+    %42:vgpr_32 = V_CVT_F32_U32_e64 %41, 0, 0, implicit $mode, implicit $exec
+    %43:sreg_32 = COPY %2.sub1
+    %44:vgpr_32 = V_CVT_F32_U32_e64 %43, 0, 0, implicit $mode, implicit $exec
+    %45:sgpr_32 = S_MOV_B32 1333788672
+    %46:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %44, 0, killed %45, 0, killed %42, 0, 0, implicit $mode, implicit $exec
+    %47:vgpr_32 = nofpexcept V_RCP_F32_e64 0, killed %46, 0, 0, implicit $mode, implicit $exec
+    %48:sgpr_32 = S_MOV_B32 1602224124
+    %49:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %47, 0, killed %48, 0, 0, implicit $mode, implicit $exec
+    %50:sgpr_32 = S_MOV_B32 796917760
+    %51:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %49, 0, killed %50, 0, 0, implicit $mode, implicit $exec
+    %52:vgpr_32 = nofpexcept V_TRUNC_F32_e64 0, killed %51, 0, 0, implicit $mode, implicit $exec
+    %53:sgpr_32 = S_MOV_B32 -813694976
+    %54:vgpr_32 = nofpexcept V_FMA_F32_e64 0, %52, 0, killed %53, 0, %49, 0, 0, implicit $mode, implicit $exec
+    %55:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed %54, 0, 0, implicit $mode, implicit $exec
+    %56:sreg_64 = S_MOV_B64 0
+    %57:sreg_64 = S_SUB_U64_PSEUDO killed %56, %2, implicit-def dead $scc
+    %58:sreg_32 = COPY %57.sub1
+    %60:sreg_32 = COPY %55
+    %59:sreg_32 = S_MUL_I32 %58, %60
+    %61:sreg_32 = COPY %57.sub0
+    %62:vgpr_32 = V_MUL_HI_U32_e64 %61, %55, implicit $exec
+    %63:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, %52, 0, 0, implicit $mode, implicit $exec
+    %65:sreg_32 = COPY %63
+    %64:sreg_32 = S_MUL_I32 %61, %65
+    %67:sreg_32 = COPY %62
+    %66:sreg_32 = S_ADD_I32 killed %67, killed %64, implicit-def dead $scc
+    %68:sreg_32 = S_ADD_I32 killed %66, killed %59, implicit-def dead $scc
+    %69:vgpr_32 = V_MUL_HI_U32_e64 %55, %68, implicit $exec
+    %71:sreg_32 = COPY %55
+    %70:sreg_32 = S_MUL_I32 %71, %68
+    %72:sreg_64 = REG_SEQUENCE killed %70, %subreg.sub0, killed %69, %subreg.sub1
+    %74:sreg_32 = COPY %55
+    %73:sreg_32 = S_MUL_I32 %61, %74
+    %75:vgpr_32 = V_MUL_HI_U32_e64 %55, %73, implicit $exec
+    %76:sreg_32 = S_MOV_B32 0
+    %77:sreg_64 = REG_SEQUENCE killed %75, %subreg.sub0, %76, %subreg.sub1
+    %78:sreg_64 = S_ADD_U64_PSEUDO killed %77, killed %72, implicit-def dead $scc
+    %79:sreg_32 = COPY %78.sub0
+    %80:sreg_32 = COPY %78.sub1
+    %81:vgpr_32 = V_MUL_HI_U32_e64 %63, %68, implicit $exec
+    %82:vgpr_32 = V_MUL_HI_U32_e64 %63, %73, implicit $exec
+    %84:sreg_32 = COPY %63
+    %83:sreg_32 = S_MUL_I32 %84, %73
+    %85:sreg_64 = REG_SEQUENCE killed %83, %subreg.sub0, killed %82, %subreg.sub1
+    %86:sreg_32 = COPY %85.sub0
+    %87:sreg_32 = COPY %85.sub1
+    %88:sreg_32 = S_MOV_B32 0
+    %89:sreg_32 = S_ADD_U32 killed %79, killed %86, implicit-def $scc
+    %90:sreg_32 = S_ADDC_U32 killed %80, killed %87, implicit-def $scc, implicit $scc
+    %92:sreg_32 = COPY %81
+    %91:sreg_32 = S_ADDC_U32 killed %92, %88, implicit-def dead $scc, implicit $scc
+    %94:sreg_32 = COPY %63
+    %93:sreg_32 = S_MUL_I32 %94, %68
+    %95:sreg_64 = REG_SEQUENCE killed %93, %subreg.sub0, killed %91, %subreg.sub1
+    %96:sreg_64 = REG_SEQUENCE killed %89, %subreg.sub0, killed %90, %subreg.sub1
+    %97:sreg_32 = COPY %96.sub1
+    %98:sreg_64 = REG_SEQUENCE killed %97, %subreg.sub0, %88, %subreg.sub1
+    %99:sreg_64 = S_ADD_U64_PSEUDO killed %98, killed %95, implicit-def dead $scc
+    %100:sreg_32 = COPY %99.sub0
+    %103:sreg_32 = COPY %55
+    %101:sreg_32, %102:sreg_64_xexec = S_UADDO_PSEUDO %103, killed %100, implicit-def dead $scc
+    %104:sreg_32 = COPY %99.sub1
+    %107:sreg_32 = COPY %63
+    %105:sreg_32, %106:sreg_64_xexec = S_ADD_CO_PSEUDO %107, killed %104, killed %102, implicit-def dead $scc
+    %108:sreg_32 = S_MUL_I32 %61, %105
+    %110:vgpr_32 = COPY %101
+    %109:vgpr_32 = V_MUL_HI_U32_e64 %61, %110, implicit $exec
+    %112:sreg_32 = COPY %109
+    %111:sreg_32 = S_ADD_I32 killed %112, killed %108, implicit-def dead $scc
+    %113:sreg_32 = S_MUL_I32 %58, %101
+    %114:sreg_32 = S_ADD_I32 killed %111, killed %113, implicit-def dead $scc
+    %116:vgpr_32 = COPY %114
+    %115:vgpr_32 = V_MUL_HI_U32_e64 %105, %116, implicit $exec
+    %117:sreg_32 = S_MUL_I32 %61, %101
+    %119:vgpr_32 = COPY %117
+    %118:vgpr_32 = V_MUL_HI_U32_e64 %105, %119, implicit $exec
+    %120:sreg_32 = S_MUL_I32 %105, %117
+    %121:sreg_64 = REG_SEQUENCE killed %120, %subreg.sub0, killed %118, %subreg.sub1
+    %122:sreg_32 = COPY %121.sub0
+    %123:sreg_32 = COPY %121.sub1
+    %125:vgpr_32 = COPY %114
+    %124:vgpr_32 = V_MUL_HI_U32_e64 %101, %125, implicit $exec
+    %126:sreg_32 = S_MUL_I32 %101, %114
+    %127:sreg_64 = REG_SEQUENCE killed %126, %subreg.sub0, killed %124, %subreg.sub1
+    %129:vgpr_32 = COPY %117
+    %128:vgpr_32 = V_MUL_HI_U32_e64 %101, %129, implicit $exec
+    %130:sreg_64 = REG_SEQUENCE killed %128, %subreg.sub0, %76, %subreg.sub1
+    %131:sreg_64 = S_ADD_U64_PSEUDO killed %130, killed %127, implicit-def dead $scc
+    %132:sreg_32 = COPY %131.sub0
+    %133:sreg_32 = COPY %131.sub1
+    %134:sreg_32 = S_ADD_U32 killed %132, killed %122, implicit-def $scc
+    %135:sreg_32 = S_ADDC_U32 killed %133, killed %123, implicit-def $scc, implicit $scc
+    %137:sreg_32 = COPY %115
+    %136:sreg_32 = S_ADDC_U32 killed %137, %88, implicit-def dead $scc, implicit $scc
+    %138:sreg_32 = S_MUL_I32 %105, %114
+    %139:sreg_64 = REG_SEQUENCE killed %138, %subreg.sub0, killed %136, %subreg.sub1
+    %140:sreg_64 = REG_SEQUENCE killed %134, %subreg.sub0, killed %135, %subreg.sub1
+    %141:sreg_32 = COPY %140.sub1
+    %142:sreg_64 = REG_SEQUENCE killed %141, %subreg.sub0, %88, %subreg.sub1
+    %143:sreg_64 = S_ADD_U64_PSEUDO killed %142, killed %139, implicit-def dead $scc
+    %144:sreg_32 = COPY %143.sub0
+    %145:sreg_32, %146:sreg_64_xexec = S_UADDO_PSEUDO %101, killed %144, implicit-def dead $scc
+    %147:sreg_32 = COPY %143.sub1
+    %148:sreg_32, %149:sreg_64_xexec = S_ADD_CO_PSEUDO %105, killed %147, killed %146, implicit-def dead $scc
+    %150:sreg_32 = COPY %1.sub0
+    %152:vgpr_32 = COPY %148
+    %151:vgpr_32 = V_MUL_HI_U32_e64 %150, %152, implicit $exec
+    %153:sreg_32 = S_MUL_I32 %150, %148
+    %154:sreg_64 = REG_SEQUENCE killed %153, %subreg.sub0, killed %151, %subreg.sub1
+    %156:vgpr_32 = COPY %145
+    %155:vgpr_32 = V_MUL_HI_U32_e64 %150, %156, implicit $exec
+    %157:sreg_64 = REG_SEQUENCE killed %155, %subreg.sub0, %76, %subreg.sub1
+    %158:sreg_64 = S_ADD_U64_PSEUDO killed %157, killed %154, implicit-def dead $scc
+    %159:sreg_32 = COPY %158.sub0
+    %160:sreg_32 = COPY %158.sub1
+    %161:sreg_32 = COPY %1.sub1
+    %163:vgpr_32 = COPY %148
+    %162:vgpr_32 = V_MUL_HI_U32_e64 %161, %163, implicit $exec
+    %165:vgpr_32 = COPY %145
+    %164:vgpr_32 = V_MUL_HI_U32_e64 %161, %165, implicit $exec
+    %166:sreg_32 = S_MUL_I32 %161, %145
+    %167:sreg_64 = REG_SEQUENCE killed %166, %subreg.sub0, killed %164, %subreg.sub1
+    %168:sreg_32 = COPY %167.sub0
+    %169:sreg_32 = COPY %167.sub1
+    %170:sreg_32 = S_ADD_U32 killed %159, killed %168, implicit-def $scc
+    %171:sreg_32 = S_ADDC_U32 killed %160, killed %169, implicit-def $scc, implicit $scc
+    %173:sreg_32 = COPY %162
+    %172:sreg_32 = S_ADDC_U32 killed %173, %88, implicit-def dead $scc, implicit $scc
+    %174:sreg_32 = S_MUL_I32 %161, %148
+    %175:sreg_64 = REG_SEQUENCE killed %174, %subreg.sub0, killed %172, %subreg.sub1
+    %176:sreg_64 = REG_SEQUENCE killed %170, %subreg.sub0, killed %171, %subreg.sub1
+    %177:sreg_32 = COPY %176.sub1
+    %178:sreg_64 = REG_SEQUENCE killed %177, %subreg.sub0, %88, %subreg.sub1
+    %179:sreg_64 = S_ADD_U64_PSEUDO killed %178, killed %175, implicit-def dead $scc
+    %180:sreg_32 = COPY %179.sub1
+    %181:sreg_32 = S_MUL_I32 %41, %180
+    %182:sreg_32 = COPY %179.sub0
+    %184:vgpr_32 = COPY %182
+    %183:vgpr_32 = V_MUL_HI_U32_e64 %41, %184, implicit $exec
+    %186:sreg_32 = COPY %183
+    %185:sreg_32 = S_ADD_I32 killed %186, killed %181, implicit-def dead $scc
+    %187:sreg_32 = S_MUL_I32 %43, %182
+    %188:sreg_32 = S_ADD_I32 killed %185, killed %187, implicit-def dead $scc
+    %189:sreg_32 = S_SUB_I32 %161, %188, implicit-def dead $scc
+    %190:sreg_32 = S_MUL_I32 %41, %182
+    %191:sreg_32, %192:sreg_64_xexec = S_USUBO_PSEUDO %150, killed %190, implicit-def dead $scc
+    %193:sreg_32, %194:sreg_64_xexec = S_SUB_CO_PSEUDO killed %189, %43, %192, implicit-def dead $scc
+    %195:sreg_32, %196:sreg_64_xexec = S_USUBO_PSEUDO %191, %41, implicit-def dead $scc
+    %197:sreg_32, %198:sreg_64_xexec = S_SUB_CO_PSEUDO killed %193, %88, killed %196, implicit-def dead $scc
+    S_CMP_GE_U32 %197, %43, implicit-def $scc
+    %199:sreg_32 = S_MOV_B32 -1
+    %200:sreg_32 = S_CSELECT_B32 %199, %88, implicit $scc
+    S_CMP_GE_U32 killed %195, %41, implicit-def $scc
+    %201:sreg_32 = S_CSELECT_B32 %199, %88, implicit $scc
+    S_CMP_EQ_U32 %197, %43, implicit-def $scc
+    %202:sreg_32 = S_CSELECT_B32 killed %201, killed %200, implicit $scc
+    %203:sreg_32 = COPY killed %202
+    %204:sreg_64 = REG_SEQUENCE %182, %subreg.sub0, %180, %subreg.sub1
+    %205:sreg_64 = S_MOV_B64 1
+    %206:sreg_64 = S_ADD_U64_PSEUDO %204, killed %205, implicit-def dead $scc
+    %207:sreg_32 = COPY %206.sub0
+    %208:sreg_64 = S_MOV_B64 2
+    %209:sreg_64 = S_ADD_U64_PSEUDO %204, killed %208, implicit-def dead $scc
+    %210:sreg_32 = COPY %209.sub0
+    S_CMP_LG_U32 killed %203, %88, implicit-def $scc
+    %211:sreg_32 = S_CSELECT_B32 killed %210, killed %207, implicit $scc
+    %212:sreg_32 = COPY %206.sub1
+    %213:sreg_32 = COPY %209.sub1
+    %214:sreg_32 = S_CSELECT_B32 killed %213, killed %212, implicit $scc
+    %215:sreg_32, %216:sreg_64_xexec = S_SUB_CO_PSEUDO %161, %188, %192, implicit-def dead $scc
+    S_CMP_GE_U32 %215, %43, implicit-def $scc
+    %217:sreg_32 = S_CSELECT_B32 %199, %88, implicit $scc
+    S_CMP_GE_U32 %191, %41, implicit-def $scc
+    %218:sreg_32 = S_CSELECT_B32 %199, %88, implicit $scc
+    S_CMP_EQ_U32 %215, %43, implicit-def $scc
+    %219:sreg_32 = S_CSELECT_B32 killed %218, killed %217, implicit $scc
+    %220:sreg_32 = COPY killed %219
+    S_CMP_LG_U32 killed %220, %88, implicit-def $scc
+    %221:sreg_32 = S_CSELECT_B32 killed %214, %180, implicit $scc
+    %222:sreg_32 = S_CSELECT_B32 killed %211, %182, implicit $scc
+    %223:sreg_64 = REG_SEQUENCE killed %222, %subreg.sub0, killed %221, %subreg.sub1
+    %40:sreg_64 = S_MOV_B64 0
+    %6:sreg_64 = COPY %223
+    S_BRANCH %bb.1
+  
+  bb.4 (%ir-block.14):
+    %7:sreg_64 = PHI %3, %bb.1, %5, %bb.2
+    %257:sreg_32 = COPY %0.sub1
+    %258:sreg_32 = COPY %0.sub0
+    %259:sreg_64 = REG_SEQUENCE killed %258, %subreg.sub0, killed %257, %subreg.sub1
+    %260:sreg_32 = COPY %259.sub1
+    %261:sreg_32 = COPY %259.sub0
+    %262:sreg_32 = S_MOV_B32 61440
+    %263:sreg_32 = S_MOV_B32 -1
+    %264:sgpr_128 = REG_SEQUENCE killed %261, %subreg.sub0, killed %260, %subreg.sub1, killed %263, %subreg.sub2, killed %262, %subreg.sub3
+    %265:vreg_64 = COPY %7
+    BUFFER_STORE_DWORDX2_OFFSET %265, killed %264, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.16, addrspace 1)
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index b000fae124ede..b3025279081c2 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -665,18 +665,18 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    flat_load_ushort v4, v[0:1]
-; VI-NEXT:    flat_load_ushort v5, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    flat_load_ushort v4, v[2:3]
+; VI-NEXT:    flat_load_ushort v5, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v5, vcc, v4, v5
+; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
 ; VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; VI-NEXT:    v_and_b32_e32 v6, 0xffff, v5
 ; VI-NEXT:    v_cmp_lt_u32_e32 vcc, v6, v4
@@ -690,10 +690,10 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[12:13]
-; GFX9-NEXT:    global_load_ushort v2, v0, s[14:15]
+; GFX9-NEXT:    global_load_ushort v1, v0, s[14:15]
+; GFX9-NEXT:    global_load_ushort v2, v0, s[12:13]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v2, v1, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v1
 ; GFX9-NEXT:    v_cmp_lt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX9-NEXT:    global_store_short v0, v2, s[8:9]
@@ -706,10 +706,10 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_ushort v1, v0, s[12:13]
-; GFX10-NEXT:    global_load_ushort v2, v0, s[14:15]
+; GFX10-NEXT:    global_load_ushort v1, v0, s[14:15]
+; GFX10-NEXT:    global_load_ushort v2, v0, s[12:13]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v1
 ; GFX10-NEXT:    v_cmp_lt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    global_store_short v0, v2, s[8:9]
@@ -722,10 +722,10 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_d16_b16 v1, v0, s[4:5]
-; GFX11-NEXT:    global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT:    global_load_d16_b16 v1, v0, s[6:7]
+; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, v1, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, v2, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v2
diff --git a/llvm/test/CodeGen/ARM/addsubo-legalization.ll b/llvm/test/CodeGen/ARM/addsubo-legalization.ll
index e5789de4ca415..62a8e97d979b0 100644
--- a/llvm/test/CodeGen/ARM/addsubo-legalization.ll
+++ b/llvm/test/CodeGen/ARM/addsubo-legalization.ll
@@ -10,14 +10,14 @@ define <2 x i1> @uaddo(ptr %ptr, ptr %ptr2) {
 ; CHECK-LABEL: uaddo:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT:    vmov r3, r2, d18
-; CHECK-NEXT:    vadd.i64 q8, q9, q8
 ; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    vmov r6, r7, d19
-; CHECK-NEXT:    vmov lr, r12, d16
-; CHECK-NEXT:    vmov r4, r5, d17
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vmov r3, r2, d16
+; CHECK-NEXT:    vadd.i64 q9, q9, q8
+; CHECK-NEXT:    vmov r6, r7, d17
+; CHECK-NEXT:    vmov lr, r12, d18
+; CHECK-NEXT:    vmov r4, r5, d19
 ; CHECK-NEXT:    subs.w r3, lr, r3
 ; CHECK-NEXT:    sbcs.w r2, r12, r2
 ; CHECK-NEXT:    mov.w r2, #0
@@ -33,7 +33,7 @@ define <2 x i1> @uaddo(ptr %ptr, ptr %ptr2) {
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    it ne
 ; CHECK-NEXT:    movne.w r1, #-1
-; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vst1.64 {d18, d19}, [r0]
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
   %x = load <2 x i64>, ptr %ptr, align 8
diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll
index 34b703a981105..78af50af71dee 100644
--- a/llvm/test/CodeGen/PowerPC/sat-add.ll
+++ b/llvm/test/CodeGen/PowerPC/sat-add.ll
@@ -113,10 +113,10 @@ define i32 @unsigned_sat_constant_i32_using_min(i32 %x) {
 define i32 @unsigned_sat_constant_i32_using_cmp_sum(i32 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i32_using_cmp_sum:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi 4, 3, 42
-; CHECK-NEXT:    cmplw 4, 3
-; CHECK-NEXT:    li 3, -1
-; CHECK-NEXT:    isellt 3, 3, 4
+; CHECK-NEXT:    addi 3, 3, 42
+; CHECK-NEXT:    li 4, -1
+; CHECK-NEXT:    cmplwi 3, 42
+; CHECK-NEXT:    isellt 3, 4, 3
 ; CHECK-NEXT:    blr
   %a = add i32 %x, 42
   %c = icmp ugt i32 %x, %a
@@ -303,10 +303,10 @@ define i32 @unsigned_sat_variable_i32_using_min(i32 %x, i32 %y) {
 define i32 @unsigned_sat_variable_i32_using_cmp_sum(i32 %x, i32 %y) {
 ; CHECK-LABEL: unsigned_sat_variable_i32_using_cmp_sum:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    add 4, 3, 4
-; CHECK-NEXT:    cmplw 4, 3
-; CHECK-NEXT:    li 3, -1
-; CHECK-NEXT:    isellt 3, 3, 4
+; CHECK-NEXT:    add 3, 3, 4
+; CHECK-NEXT:    cmplw 3, 4
+; CHECK-NEXT:    li 4, -1
+; CHECK-NEXT:    isellt 3, 4, 3
 ; CHECK-NEXT:    blr
   %a = add i32 %x, %y
   %c = icmp ugt i32 %x, %a
diff --git a/llvm/test/CodeGen/RISCV/addcarry.ll b/llvm/test/CodeGen/RISCV/addcarry.ll
index 153c97faddec8..de1193fde98f8 100644
--- a/llvm/test/CodeGen/RISCV/addcarry.ll
+++ b/llvm/test/CodeGen/RISCV/addcarry.ll
@@ -16,7 +16,7 @@ define i64 @addcarry(i64 %x, i64 %y) nounwind {
 ; RISCV32-NEXT:    add a7, a7, t0
 ; RISCV32-NEXT:    add a4, t1, a6
 ; RISCV32-NEXT:    sltu a5, t1, a5
-; RISCV32-NEXT:    sltu a6, a4, t1
+; RISCV32-NEXT:    sltu a6, a4, a6
 ; RISCV32-NEXT:    add a5, a7, a5
 ; RISCV32-NEXT:    add a5, a5, a6
 ; RISCV32-NEXT:    mul a6, a1, a3
@@ -45,16 +45,16 @@ define i64 @addcarry(i64 %x, i64 %y) nounwind {
 define { i32, i32, i1 } @addcarry_2x32(i32 %x0, i32 %x1, i32 %y0, i32 %y1) nounwind {
 ; RISCV32-LABEL: addcarry_2x32:
 ; RISCV32:       # %bb.0:
-; RISCV32-NEXT:    add a3, a1, a3
-; RISCV32-NEXT:    add a4, a2, a4
-; RISCV32-NEXT:    sltu a1, a3, a1
-; RISCV32-NEXT:    sltu a2, a4, a2
-; RISCV32-NEXT:    add a1, a4, a1
-; RISCV32-NEXT:    sltu a4, a1, a4
-; RISCV32-NEXT:    or a2, a2, a4
-; RISCV32-NEXT:    sw a3, 0(a0)
-; RISCV32-NEXT:    sw a1, 4(a0)
-; RISCV32-NEXT:    sb a2, 8(a0)
+; RISCV32-NEXT:    add a1, a1, a3
+; RISCV32-NEXT:    add a2, a2, a4
+; RISCV32-NEXT:    sltu a3, a1, a3
+; RISCV32-NEXT:    sltu a4, a2, a4
+; RISCV32-NEXT:    add a2, a2, a3
+; RISCV32-NEXT:    sltu a3, a2, a3
+; RISCV32-NEXT:    or a3, a4, a3
+; RISCV32-NEXT:    sw a1, 0(a0)
+; RISCV32-NEXT:    sw a2, 4(a0)
+; RISCV32-NEXT:    sb a3, 8(a0)
 ; RISCV32-NEXT:    ret
   %t0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x0, i32 %y0)
   %s0 = extractvalue { i32, i1 } %t0, 0
diff --git a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
index 84526a1fca0f9..af78eff8c3382 100644
--- a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
+++ b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
@@ -39,9 +39,9 @@ entry:
 define i1 @uadd(i32 %a, i32 %b, ptr %c) nounwind {
 ; RV32I-LABEL: uadd:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    add a1, a0, a1
-; RV32I-NEXT:    sltu a0, a1, a0
-; RV32I-NEXT:    sw a1, 0(a2)
+; RV32I-NEXT:    add a3, a0, a1
+; RV32I-NEXT:    sltu a0, a3, a1
+; RV32I-NEXT:    sw a3, 0(a2)
 ; RV32I-NEXT:    ret
 entry:
   %x = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index 0306bb18c2aed..998bed0cc958a 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -69,12 +69,12 @@ define i64 @uaddo1_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
 ;
 ; RV64-LABEL: uaddo1_math_overflow_used:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    bltu a0, a1, .LBB1_2
+; RV64-NEXT:    add a3, a1, a0
+; RV64-NEXT:    bltu a3, a0, .LBB1_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 42
 ; RV64-NEXT:  .LBB1_2:
-; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    sd a3, 0(a2)
 ; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %add = add i64 %b, %a
@@ -143,12 +143,12 @@ define i64 @uaddo2_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
 ;
 ; RV64-LABEL: uaddo2_math_overflow_used:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    bltu a0, a1, .LBB3_2
+; RV64-NEXT:    add a3, a1, a0
+; RV64-NEXT:    bltu a3, a0, .LBB3_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 42
 ; RV64-NEXT:  .LBB3_2:
-; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    sd a3, 0(a2)
 ; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %add = add i64 %b, %a
@@ -217,12 +217,12 @@ define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
 ;
 ; RV64-LABEL: uaddo3_math_overflow_used:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    bltu a0, a1, .LBB5_2
+; RV64-NEXT:    add a3, a1, a0
+; RV64-NEXT:    bltu a3, a0, .LBB5_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 42
 ; RV64-NEXT:  .LBB5_2:
-; RV64-NEXT:    sd a0, 0(a2)
+; RV64-NEXT:    sd a3, 0(a2)
 ; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:    ret
   %add = add i64 %b, %a
diff --git a/llvm/test/CodeGen/RISCV/uadd_sat.ll b/llvm/test/CodeGen/RISCV/uadd_sat.ll
index 4e0c4ab750592..a896aa89cfda2 100644
--- a/llvm/test/CodeGen/RISCV/uadd_sat.ll
+++ b/llvm/test/CodeGen/RISCV/uadd_sat.ll
@@ -7,10 +7,10 @@
 define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
 ; RV32I-LABEL: func:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    add a1, a0, a1
-; RV32I-NEXT:    sltu a0, a1, a0
-; RV32I-NEXT:    neg a0, a0
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    sltu a1, a0, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func:
@@ -56,10 +56,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ;
 ; RV64I-LABEL: func2:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    add a1, a0, a1
-; RV64I-NEXT:    sltu a0, a1, a0
-; RV64I-NEXT:    neg a0, a0
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    sltu a1, a0, a1
+; RV64I-NEXT:    neg a1, a1
+; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func2:
diff --git a/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll b/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll
index a6afef4286dea..82bcff51b4c4d 100644
--- a/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll
@@ -8,10 +8,10 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; RV32I-LABEL: func32:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mul a1, a1, a2
-; RV32I-NEXT:    add a1, a0, a1
-; RV32I-NEXT:    sltu a0, a1, a0
-; RV32I-NEXT:    neg a0, a0
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    sltu a1, a0, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func32:
@@ -63,10 +63,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ;
 ; RV64I-LABEL: func64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    add a2, a0, a2
-; RV64I-NEXT:    sltu a0, a2, a0
-; RV64I-NEXT:    neg a0, a0
-; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    sltu a1, a0, a2
+; RV64I-NEXT:    neg a1, a1
+; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func64:
diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
index 938e6550387f5..4b1e20696dbc8 100644
--- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
@@ -23,50 +23,50 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    mul t2, t0, a6
 ; RISCV32-NEXT:    mulhu t3, t0, a6
 ; RISCV32-NEXT:    mul t4, a4, a5
-; RISCV32-NEXT:    mulhu t5, a4, a5
+; RISCV32-NEXT:    mulhu s0, a4, a5
 ; RISCV32-NEXT:    mul s2, t0, a5
-; RISCV32-NEXT:    mul t6, a7, a4
+; RISCV32-NEXT:    mul s1, a7, a4
 ; RISCV32-NEXT:    mul s3, a3, a6
-; RISCV32-NEXT:    mul s0, t0, a7
-; RISCV32-NEXT:    mul s1, a2, a4
+; RISCV32-NEXT:    mul t6, t0, a7
+; RISCV32-NEXT:    mul t5, a2, a4
 ; RISCV32-NEXT:    mul s4, a5, a3
-; RISCV32-NEXT:    add s1, s1, s0
-; RISCV32-NEXT:    mul s0, a1, a6
-; RISCV32-NEXT:    add s4, s0, s4
-; RISCV32-NEXT:    mulhu s5, t0, a5
+; RISCV32-NEXT:    add t6, t5, t6
+; RISCV32-NEXT:    mul t5, a1, a6
+; RISCV32-NEXT:    add t5, t5, s4
+; RISCV32-NEXT:    mulhu s4, t0, a5
 ; RISCV32-NEXT:    add t1, t2, t1
 ; RISCV32-NEXT:    sltu t2, t1, t2
 ; RISCV32-NEXT:    add t2, t3, t2
-; RISCV32-NEXT:    mulhu s0, a7, a4
+; RISCV32-NEXT:    mulhu t3, a7, a4
 ; RISCV32-NEXT:    add t1, t4, t1
-; RISCV32-NEXT:    sltu t3, t1, t4
-; RISCV32-NEXT:    add t3, t5, t3
-; RISCV32-NEXT:    mulhu t5, a3, a6
-; RISCV32-NEXT:    add t4, s3, t6
-; RISCV32-NEXT:    add s1, s0, s1
-; RISCV32-NEXT:    add t6, t5, s4
-; RISCV32-NEXT:    sltu s3, t4, s3
-; RISCV32-NEXT:    add t3, t2, t3
-; RISCV32-NEXT:    sltu t2, t3, t2
-; RISCV32-NEXT:    add s5, s5, t2
-; RISCV32-NEXT:    add s4, t6, s1
+; RISCV32-NEXT:    sltu t4, t1, t4
+; RISCV32-NEXT:    add t4, s0, t4
+; RISCV32-NEXT:    mulhu s0, a3, a6
+; RISCV32-NEXT:    add s5, s3, s1
+; RISCV32-NEXT:    add s1, t3, t6
+; RISCV32-NEXT:    add s0, s0, t5
+; RISCV32-NEXT:    sltu t3, s5, s3
+; RISCV32-NEXT:    add t4, t2, t4
+; RISCV32-NEXT:    sltu t2, t4, t2
+; RISCV32-NEXT:    add s4, s4, t2
+; RISCV32-NEXT:    add s3, s0, s1
+; RISCV32-NEXT:    add t4, s2, t4
+; RISCV32-NEXT:    add t2, t4, s5
+; RISCV32-NEXT:    sltu s2, t4, s2
+; RISCV32-NEXT:    sltu t4, t2, t4
+; RISCV32-NEXT:    add s2, s4, s2
+; RISCV32-NEXT:    add t3, s3, t3
 ; RISCV32-NEXT:    add t3, s2, t3
-; RISCV32-NEXT:    add t2, t3, t4
-; RISCV32-NEXT:    sltu s2, t3, s2
-; RISCV32-NEXT:    sltu t4, t2, t3
-; RISCV32-NEXT:    add s2, s5, s2
-; RISCV32-NEXT:    add s3, s4, s3
-; RISCV32-NEXT:    add t3, s2, s3
 ; RISCV32-NEXT:    add t3, t3, t4
 ; RISCV32-NEXT:    beq t3, s2, .LBB0_2
 ; RISCV32-NEXT:  # %bb.1: # %start
 ; RISCV32-NEXT:    sltu t4, t3, s2
 ; RISCV32-NEXT:  .LBB0_2: # %start
-; RISCV32-NEXT:    sltu s0, s1, s0
+; RISCV32-NEXT:    sltu t6, s1, t6
 ; RISCV32-NEXT:    snez s1, t0
 ; RISCV32-NEXT:    snez s2, a2
-; RISCV32-NEXT:    sltu t5, t6, t5
-; RISCV32-NEXT:    mulhu t6, a2, a4
+; RISCV32-NEXT:    sltu t5, s0, t5
+; RISCV32-NEXT:    mulhu s0, a2, a4
 ; RISCV32-NEXT:    mulhu t0, t0, a7
 ; RISCV32-NEXT:    or a2, a7, a2
 ; RISCV32-NEXT:    snez a7, a5
@@ -76,19 +76,19 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    or a3, a3, a1
 ; RISCV32-NEXT:    snez a1, a1
 ; RISCV32-NEXT:    and s1, s2, s1
-; RISCV32-NEXT:    snez t6, t6
+; RISCV32-NEXT:    snez s0, s0
 ; RISCV32-NEXT:    snez t0, t0
 ; RISCV32-NEXT:    and a1, a1, a7
 ; RISCV32-NEXT:    snez a6, a6
 ; RISCV32-NEXT:    snez a5, a5
 ; RISCV32-NEXT:    snez a2, a2
 ; RISCV32-NEXT:    snez a3, a3
-; RISCV32-NEXT:    or a7, s1, t6
+; RISCV32-NEXT:    or s0, s1, s0
 ; RISCV32-NEXT:    or a1, a1, a6
 ; RISCV32-NEXT:    and a2, a3, a2
-; RISCV32-NEXT:    or a3, a7, t0
+; RISCV32-NEXT:    or a3, s0, t0
 ; RISCV32-NEXT:    or a1, a1, a5
-; RISCV32-NEXT:    or a3, a3, s0
+; RISCV32-NEXT:    or a3, a3, t6
 ; RISCV32-NEXT:    or a1, a1, t5
 ; RISCV32-NEXT:    or a1, a2, a1
 ; RISCV32-NEXT:    or a1, a1, a3
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index 0de2cbd76b749..7ea2ed7a2063b 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -464,9 +464,9 @@ entry:
 define zeroext i1 @uaddo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ; RV32-LABEL: uaddo.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    sltu a0, a1, a0
-; RV32-NEXT:    sw a1, 0(a2)
+; RV32-NEXT:    add a3, a0, a1
+; RV32-NEXT:    sltu a0, a3, a1
+; RV32-NEXT:    sw a3, 0(a2)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: uaddo.i32:
@@ -478,9 +478,9 @@ define zeroext i1 @uaddo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ;
 ; RV32ZBA-LABEL: uaddo.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    add a1, a0, a1
-; RV32ZBA-NEXT:    sltu a0, a1, a0
-; RV32ZBA-NEXT:    sw a1, 0(a2)
+; RV32ZBA-NEXT:    add a3, a0, a1
+; RV32ZBA-NEXT:    sltu a0, a3, a1
+; RV32ZBA-NEXT:    sw a3, 0(a2)
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: uaddo.i32:
@@ -492,9 +492,9 @@ define zeroext i1 @uaddo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ;
 ; RV32ZICOND-LABEL: uaddo.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    add a1, a0, a1
-; RV32ZICOND-NEXT:    sltu a0, a1, a0
-; RV32ZICOND-NEXT:    sw a1, 0(a2)
+; RV32ZICOND-NEXT:    add a3, a0, a1
+; RV32ZICOND-NEXT:    sltu a0, a3, a1
+; RV32ZICOND-NEXT:    sw a3, 0(a2)
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: uaddo.i32:
@@ -515,7 +515,7 @@ define zeroext i1 @uaddo.i32.constant(i32 signext %v1, ptr %res) {
 ; RV32-LABEL: uaddo.i32.constant:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi a2, a0, -2
-; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    sltiu a0, a2, -2
 ; RV32-NEXT:    sw a2, 0(a1)
 ; RV32-NEXT:    ret
 ;
@@ -529,7 +529,7 @@ define zeroext i1 @uaddo.i32.constant(i32 signext %v1, ptr %res) {
 ; RV32ZBA-LABEL: uaddo.i32.constant:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    addi a2, a0, -2
-; RV32ZBA-NEXT:    sltu a0, a2, a0
+; RV32ZBA-NEXT:    sltiu a0, a2, -2
 ; RV32ZBA-NEXT:    sw a2, 0(a1)
 ; RV32ZBA-NEXT:    ret
 ;
@@ -543,7 +543,7 @@ define zeroext i1 @uaddo.i32.constant(i32 signext %v1, ptr %res) {
 ; RV32ZICOND-LABEL: uaddo.i32.constant:
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    addi a2, a0, -2
-; RV32ZICOND-NEXT:    sltu a0, a2, a0
+; RV32ZICOND-NEXT:    sltiu a0, a2, -2
 ; RV32ZICOND-NEXT:    sw a2, 0(a1)
 ; RV32ZICOND-NEXT:    ret
 ;
@@ -628,9 +628,9 @@ define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) {
 ;
 ; RV64-LABEL: uaddo.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    sltu a0, a1, a0
-; RV64-NEXT:    sd a1, 0(a2)
+; RV64-NEXT:    add a3, a0, a1
+; RV64-NEXT:    sltu a0, a3, a1
+; RV64-NEXT:    sd a3, 0(a2)
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: uaddo.i64:
@@ -649,9 +649,9 @@ define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) {
 ;
 ; RV64ZBA-LABEL: uaddo.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    add a1, a0, a1
-; RV64ZBA-NEXT:    sltu a0, a1, a0
-; RV64ZBA-NEXT:    sd a1, 0(a2)
+; RV64ZBA-NEXT:    add a3, a0, a1
+; RV64ZBA-NEXT:    sltu a0, a3, a1
+; RV64ZBA-NEXT:    sd a3, 0(a2)
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: uaddo.i64:
@@ -671,9 +671,9 @@ define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) {
 ;
 ; RV64ZICOND-LABEL: uaddo.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    add a1, a0, a1
-; RV64ZICOND-NEXT:    sltu a0, a1, a0
-; RV64ZICOND-NEXT:    sd a1, 0(a2)
+; RV64ZICOND-NEXT:    add a3, a0, a1
+; RV64ZICOND-NEXT:    sltu a0, a3, a1
+; RV64ZICOND-NEXT:    sd a3, 0(a2)
 ; RV64ZICOND-NEXT:    ret
 entry:
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
@@ -1788,13 +1788,13 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32-NEXT:    and a1, a1, t0
 ; RV32-NEXT:    snez a0, a0
 ; RV32-NEXT:    snez a2, a3
-; RV32-NEXT:    add a5, a7, a5
+; RV32-NEXT:    add a7, a7, a5
 ; RV32-NEXT:    or a0, a1, a0
-; RV32-NEXT:    sltu a1, a5, a7
+; RV32-NEXT:    sltu a1, a7, a5
 ; RV32-NEXT:    or a0, a0, a2
 ; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    sw t1, 0(a4)
-; RV32-NEXT:    sw a5, 4(a4)
+; RV32-NEXT:    sw a7, 4(a4)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: umulo.i64:
@@ -1820,13 +1820,13 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZBA-NEXT:    and a1, a1, t0
 ; RV32ZBA-NEXT:    snez a0, a0
 ; RV32ZBA-NEXT:    snez a2, a3
-; RV32ZBA-NEXT:    add a5, a7, a5
+; RV32ZBA-NEXT:    add a7, a7, a5
 ; RV32ZBA-NEXT:    or a0, a1, a0
-; RV32ZBA-NEXT:    sltu a1, a5, a7
+; RV32ZBA-NEXT:    sltu a1, a7, a5
 ; RV32ZBA-NEXT:    or a0, a0, a2
 ; RV32ZBA-NEXT:    or a0, a0, a1
 ; RV32ZBA-NEXT:    sw t1, 0(a4)
-; RV32ZBA-NEXT:    sw a5, 4(a4)
+; RV32ZBA-NEXT:    sw a7, 4(a4)
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: umulo.i64:
@@ -1851,13 +1851,13 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZICOND-NEXT:    czero.eqz a1, a6, a1
 ; RV32ZICOND-NEXT:    snez a0, a0
 ; RV32ZICOND-NEXT:    snez a2, a3
-; RV32ZICOND-NEXT:    add a5, a7, a5
+; RV32ZICOND-NEXT:    add a7, a7, a5
 ; RV32ZICOND-NEXT:    or a0, a1, a0
-; RV32ZICOND-NEXT:    sltu a1, a5, a7
+; RV32ZICOND-NEXT:    sltu a1, a7, a5
 ; RV32ZICOND-NEXT:    or a0, a0, a2
 ; RV32ZICOND-NEXT:    or a0, a0, a1
 ; RV32ZICOND-NEXT:    sw t0, 0(a4)
-; RV32ZICOND-NEXT:    sw a5, 4(a4)
+; RV32ZICOND-NEXT:    sw a7, 4(a4)
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: umulo.i64:
@@ -1884,12 +1884,12 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
 ; RV32-NEXT:    mulhu a5, a0, a3
 ; RV32-NEXT:    mulhu a1, a1, a3
 ; RV32-NEXT:    mul a3, a0, a3
-; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    add a5, a5, a4
 ; RV32-NEXT:    snez a0, a1
-; RV32-NEXT:    sltu a1, a4, a5
+; RV32-NEXT:    sltu a1, a5, a4
 ; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    sw a3, 0(a2)
-; RV32-NEXT:    sw a4, 4(a2)
+; RV32-NEXT:    sw a5, 4(a2)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: umulo2.i64:
@@ -1911,12 +1911,12 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
 ; RV32ZBA-NEXT:    mulhu a1, a1, a3
 ; RV32ZBA-NEXT:    mulhu a3, a0, a3
 ; RV32ZBA-NEXT:    sh2add a5, a5, a0
-; RV32ZBA-NEXT:    add a4, a3, a4
+; RV32ZBA-NEXT:    add a3, a3, a4
 ; RV32ZBA-NEXT:    snez a0, a1
-; RV32ZBA-NEXT:    sltu a1, a4, a3
+; RV32ZBA-NEXT:    sltu a1, a3, a4
 ; RV32ZBA-NEXT:    or a0, a0, a1
 ; RV32ZBA-NEXT:    sw a5, 0(a2)
-; RV32ZBA-NEXT:    sw a4, 4(a2)
+; RV32ZBA-NEXT:    sw a3, 4(a2)
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: umulo2.i64:
@@ -1937,12 +1937,12 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
 ; RV32ZICOND-NEXT:    mulhu a5, a0, a3
 ; RV32ZICOND-NEXT:    mulhu a1, a1, a3
 ; RV32ZICOND-NEXT:    mul a3, a0, a3
-; RV32ZICOND-NEXT:    add a4, a5, a4
+; RV32ZICOND-NEXT:    add a5, a5, a4
 ; RV32ZICOND-NEXT:    snez a0, a1
-; RV32ZICOND-NEXT:    sltu a1, a4, a5
+; RV32ZICOND-NEXT:    sltu a1, a5, a4
 ; RV32ZICOND-NEXT:    or a0, a0, a1
 ; RV32ZICOND-NEXT:    sw a3, 0(a2)
-; RV32ZICOND-NEXT:    sw a4, 4(a2)
+; RV32ZICOND-NEXT:    sw a5, 4(a2)
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: umulo2.i64:
@@ -2266,7 +2266,7 @@ define i32 @uaddo.select.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32-LABEL: uaddo.select.i32:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    add a2, a0, a1
-; RV32-NEXT:    bltu a2, a0, .LBB32_2
+; RV32-NEXT:    bltu a2, a1, .LBB32_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:  .LBB32_2: # %entry
@@ -2284,7 +2284,7 @@ define i32 @uaddo.select.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32ZBA-LABEL: uaddo.select.i32:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    add a2, a0, a1
-; RV32ZBA-NEXT:    bltu a2, a0, .LBB32_2
+; RV32ZBA-NEXT:    bltu a2, a1, .LBB32_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
 ; RV32ZBA-NEXT:    mv a0, a1
 ; RV32ZBA-NEXT:  .LBB32_2: # %entry
@@ -2302,7 +2302,7 @@ define i32 @uaddo.select.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32ZICOND-LABEL: uaddo.select.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    add a2, a0, a1
-; RV32ZICOND-NEXT:    sltu a2, a2, a0
+; RV32ZICOND-NEXT:    sltu a2, a2, a1
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a2
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a2
 ; RV32ZICOND-NEXT:    or a0, a0, a1
@@ -2326,8 +2326,8 @@ entry:
 define i1 @uaddo.not.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32-LABEL: uaddo.not.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    sltu a0, a1, a0
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:    ret
 ;
@@ -2340,8 +2340,8 @@ define i1 @uaddo.not.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZBA-LABEL: uaddo.not.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    add a1, a0, a1
-; RV32ZBA-NEXT:    sltu a0, a1, a0
+; RV32ZBA-NEXT:    add a0, a0, a1
+; RV32ZBA-NEXT:    sltu a0, a0, a1
 ; RV32ZBA-NEXT:    xori a0, a0, 1
 ; RV32ZBA-NEXT:    ret
 ;
@@ -2354,8 +2354,8 @@ define i1 @uaddo.not.i32(i32 signext %v1, i32 signext %v2) {
 ;
 ; RV32ZICOND-LABEL: uaddo.not.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    add a1, a0, a1
-; RV32ZICOND-NEXT:    sltu a0, a1, a0
+; RV32ZICOND-NEXT:    add a0, a0, a1
+; RV32ZICOND-NEXT:    sltu a0, a0, a1
 ; RV32ZICOND-NEXT:    xori a0, a0, 1
 ; RV32ZICOND-NEXT:    ret
 ;
@@ -2395,7 +2395,7 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
 ; RV64-LABEL: uaddo.select.i64:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    add a2, a0, a1
-; RV64-NEXT:    bltu a2, a0, .LBB34_2
+; RV64-NEXT:    bltu a2, a1, .LBB34_2
 ; RV64-NEXT:  # %bb.1: # %entry
 ; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:  .LBB34_2: # %entry
@@ -2423,7 +2423,7 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
 ; RV64ZBA-LABEL: uaddo.select.i64:
 ; RV64ZBA:       # %bb.0: # %entry
 ; RV64ZBA-NEXT:    add a2, a0, a1
-; RV64ZBA-NEXT:    bltu a2, a0, .LBB34_2
+; RV64ZBA-NEXT:    bltu a2, a1, .LBB34_2
 ; RV64ZBA-NEXT:  # %bb.1: # %entry
 ; RV64ZBA-NEXT:    mv a0, a1
 ; RV64ZBA-NEXT:  .LBB34_2: # %entry
@@ -2451,7 +2451,7 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
 ; RV64ZICOND-LABEL: uaddo.select.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
 ; RV64ZICOND-NEXT:    add a2, a0, a1
-; RV64ZICOND-NEXT:    sltu a2, a2, a0
+; RV64ZICOND-NEXT:    sltu a2, a2, a1
 ; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
 ; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
 ; RV64ZICOND-NEXT:    or a0, a0, a1
@@ -2479,8 +2479,8 @@ define i1 @uaddo.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64-LABEL: uaddo.not.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    xori a0, a0, 1
 ; RV64-NEXT:    ret
 ;
@@ -2499,8 +2499,8 @@ define i1 @uaddo.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZBA-LABEL: uaddo.not.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    add a1, a0, a1
-; RV64ZBA-NEXT:    sltu a0, a1, a0
+; RV64ZBA-NEXT:    add a0, a0, a1
+; RV64ZBA-NEXT:    sltu a0, a0, a1
 ; RV64ZBA-NEXT:    xori a0, a0, 1
 ; RV64ZBA-NEXT:    ret
 ;
@@ -2520,8 +2520,8 @@ define i1 @uaddo.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZICOND-LABEL: uaddo.not.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    add a1, a0, a1
-; RV64ZICOND-NEXT:    sltu a0, a1, a0
+; RV64ZICOND-NEXT:    add a0, a0, a1
+; RV64ZICOND-NEXT:    sltu a0, a0, a1
 ; RV64ZICOND-NEXT:    xori a0, a0, 1
 ; RV64ZICOND-NEXT:    ret
 entry:
@@ -3623,8 +3623,8 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    snez a6, a6
 ; RV32-NEXT:    or a5, a5, a6
 ; RV32-NEXT:    mulhu a6, a0, a2
-; RV32-NEXT:    add a4, a6, a4
-; RV32-NEXT:    sltu a4, a4, a6
+; RV32-NEXT:    add a6, a6, a4
+; RV32-NEXT:    sltu a4, a6, a4
 ; RV32-NEXT:    mulhu a6, a3, a0
 ; RV32-NEXT:    snez a6, a6
 ; RV32-NEXT:    or a5, a5, a6
@@ -3657,8 +3657,8 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    snez a6, a6
 ; RV32ZBA-NEXT:    or a5, a5, a6
 ; RV32ZBA-NEXT:    mulhu a6, a0, a2
-; RV32ZBA-NEXT:    add a4, a6, a4
-; RV32ZBA-NEXT:    sltu a4, a4, a6
+; RV32ZBA-NEXT:    add a6, a6, a4
+; RV32ZBA-NEXT:    sltu a4, a6, a4
 ; RV32ZBA-NEXT:    mulhu a6, a3, a0
 ; RV32ZBA-NEXT:    snez a6, a6
 ; RV32ZBA-NEXT:    or a5, a5, a6
@@ -3690,8 +3690,8 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    snez a5, a5
 ; RV32ZICOND-NEXT:    or a5, a6, a5
 ; RV32ZICOND-NEXT:    mulhu a6, a0, a2
-; RV32ZICOND-NEXT:    add a4, a6, a4
-; RV32ZICOND-NEXT:    sltu a4, a4, a6
+; RV32ZICOND-NEXT:    add a6, a6, a4
+; RV32ZICOND-NEXT:    sltu a4, a6, a4
 ; RV32ZICOND-NEXT:    mulhu a6, a3, a0
 ; RV32ZICOND-NEXT:    snez a6, a6
 ; RV32ZICOND-NEXT:    or a5, a5, a6
@@ -3732,9 +3732,9 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    and a1, a1, a3
 ; RV32-NEXT:    snez a2, a2
 ; RV32-NEXT:    snez a0, a0
-; RV32-NEXT:    add a4, a6, a4
+; RV32-NEXT:    add a6, a6, a4
 ; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    sltu a2, a4, a6
+; RV32-NEXT:    sltu a2, a6, a4
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    or a0, a0, a2
 ; RV32-NEXT:    xori a0, a0, 1
@@ -3759,9 +3759,9 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    and a1, a1, a3
 ; RV32ZBA-NEXT:    snez a2, a2
 ; RV32ZBA-NEXT:    snez a0, a0
-; RV32ZBA-NEXT:    add a4, a6, a4
+; RV32ZBA-NEXT:    add a6, a6, a4
 ; RV32ZBA-NEXT:    or a1, a1, a2
-; RV32ZBA-NEXT:    sltu a2, a4, a6
+; RV32ZBA-NEXT:    sltu a2, a6, a4
 ; RV32ZBA-NEXT:    or a0, a1, a0
 ; RV32ZBA-NEXT:    or a0, a0, a2
 ; RV32ZBA-NEXT:    xori a0, a0, 1
@@ -3785,9 +3785,9 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    czero.eqz a1, a3, a1
 ; RV32ZICOND-NEXT:    snez a2, a2
 ; RV32ZICOND-NEXT:    snez a0, a0
-; RV32ZICOND-NEXT:    add a4, a5, a4
+; RV32ZICOND-NEXT:    add a5, a5, a4
 ; RV32ZICOND-NEXT:    or a1, a1, a2
-; RV32ZICOND-NEXT:    sltu a2, a4, a5
+; RV32ZICOND-NEXT:    sltu a2, a5, a4
 ; RV32ZICOND-NEXT:    or a0, a1, a0
 ; RV32ZICOND-NEXT:    or a0, a0, a2
 ; RV32ZICOND-NEXT:    xori a0, a0, 1
@@ -4005,8 +4005,8 @@ continue:
 define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
 ; RV32-LABEL: uaddo.br.i32:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    bgeu a1, a0, .LBB54_2
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    bgeu a0, a1, .LBB54_2
 ; RV32-NEXT:  # %bb.1: # %overflow
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    ret
@@ -4028,8 +4028,8 @@ define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
 ;
 ; RV32ZBA-LABEL: uaddo.br.i32:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    add a1, a0, a1
-; RV32ZBA-NEXT:    bgeu a1, a0, .LBB54_2
+; RV32ZBA-NEXT:    add a0, a0, a1
+; RV32ZBA-NEXT:    bgeu a0, a1, .LBB54_2
 ; RV32ZBA-NEXT:  # %bb.1: # %overflow
 ; RV32ZBA-NEXT:    li a0, 0
 ; RV32ZBA-NEXT:    ret
@@ -4051,8 +4051,8 @@ define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
 ;
 ; RV32ZICOND-LABEL: uaddo.br.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    add a1, a0, a1
-; RV32ZICOND-NEXT:    bgeu a1, a0, .LBB54_2
+; RV32ZICOND-NEXT:    add a0, a0, a1
+; RV32ZICOND-NEXT:    bgeu a0, a1, .LBB54_2
 ; RV32ZICOND-NEXT:  # %bb.1: # %overflow
 ; RV32ZICOND-NEXT:    li a0, 0
 ; RV32ZICOND-NEXT:    ret
@@ -4105,8 +4105,8 @@ define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64-LABEL: uaddo.br.i64:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    bgeu a1, a0, .LBB55_2
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    bgeu a0, a1, .LBB55_2
 ; RV64-NEXT:  # %bb.1: # %overflow
 ; RV64-NEXT:    li a0, 0
 ; RV64-NEXT:    ret
@@ -4134,8 +4134,8 @@ define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZBA-LABEL: uaddo.br.i64:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    add a1, a0, a1
-; RV64ZBA-NEXT:    bgeu a1, a0, .LBB55_2
+; RV64ZBA-NEXT:    add a0, a0, a1
+; RV64ZBA-NEXT:    bgeu a0, a1, .LBB55_2
 ; RV64ZBA-NEXT:  # %bb.1: # %overflow
 ; RV64ZBA-NEXT:    li a0, 0
 ; RV64ZBA-NEXT:    ret
@@ -4164,8 +4164,8 @@ define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV64ZICOND-LABEL: uaddo.br.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    add a1, a0, a1
-; RV64ZICOND-NEXT:    bgeu a1, a0, .LBB55_2
+; RV64ZICOND-NEXT:    add a0, a0, a1
+; RV64ZICOND-NEXT:    bgeu a0, a1, .LBB55_2
 ; RV64ZICOND-NEXT:  # %bb.1: # %overflow
 ; RV64ZICOND-NEXT:    li a0, 0
 ; RV64ZICOND-NEXT:    ret
@@ -5077,9 +5077,9 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    and a1, a1, a3
 ; RV32-NEXT:    snez a2, a2
 ; RV32-NEXT:    snez a0, a0
-; RV32-NEXT:    add a4, a6, a4
+; RV32-NEXT:    add a6, a6, a4
 ; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    sltu a2, a4, a6
+; RV32-NEXT:    sltu a2, a6, a4
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    or a0, a0, a2
 ; RV32-NEXT:    beqz a0, .LBB64_2
@@ -5114,9 +5114,9 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    and a1, a1, a3
 ; RV32ZBA-NEXT:    snez a2, a2
 ; RV32ZBA-NEXT:    snez a0, a0
-; RV32ZBA-NEXT:    add a4, a6, a4
+; RV32ZBA-NEXT:    add a6, a6, a4
 ; RV32ZBA-NEXT:    or a1, a1, a2
-; RV32ZBA-NEXT:    sltu a2, a4, a6
+; RV32ZBA-NEXT:    sltu a2, a6, a4
 ; RV32ZBA-NEXT:    or a0, a1, a0
 ; RV32ZBA-NEXT:    or a0, a0, a2
 ; RV32ZBA-NEXT:    beqz a0, .LBB64_2
@@ -5150,9 +5150,9 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    czero.eqz a1, a3, a1
 ; RV32ZICOND-NEXT:    snez a2, a2
 ; RV32ZICOND-NEXT:    snez a0, a0
-; RV32ZICOND-NEXT:    add a4, a5, a4
+; RV32ZICOND-NEXT:    add a5, a5, a4
 ; RV32ZICOND-NEXT:    or a1, a1, a2
-; RV32ZICOND-NEXT:    sltu a2, a4, a5
+; RV32ZICOND-NEXT:    sltu a2, a5, a4
 ; RV32ZICOND-NEXT:    or a0, a1, a0
 ; RV32ZICOND-NEXT:    or a0, a0, a2
 ; RV32ZICOND-NEXT:    beqz a0, .LBB64_2
@@ -5302,7 +5302,7 @@ define zeroext i1 @uaddo.i64.constant(i64 %v1, ptr %res) {
 ; RV64-LABEL: uaddo.i64.constant:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    addi a2, a0, 2
-; RV64-NEXT:    sltu a0, a2, a0
+; RV64-NEXT:    sltiu a0, a2, 2
 ; RV64-NEXT:    sd a2, 0(a1)
 ; RV64-NEXT:    ret
 ;
@@ -5320,7 +5320,7 @@ define zeroext i1 @uaddo.i64.constant(i64 %v1, ptr %res) {
 ; RV64ZBA-LABEL: uaddo.i64.constant:
 ; RV64ZBA:       # %bb.0: # %entry
 ; RV64ZBA-NEXT:    addi a2, a0, 2
-; RV64ZBA-NEXT:    sltu a0, a2, a0
+; RV64ZBA-NEXT:    sltiu a0, a2, 2
 ; RV64ZBA-NEXT:    sd a2, 0(a1)
 ; RV64ZBA-NEXT:    ret
 ;
@@ -5338,7 +5338,7 @@ define zeroext i1 @uaddo.i64.constant(i64 %v1, ptr %res) {
 ; RV64ZICOND-LABEL: uaddo.i64.constant:
 ; RV64ZICOND:       # %bb.0: # %entry
 ; RV64ZICOND-NEXT:    addi a2, a0, 2
-; RV64ZICOND-NEXT:    sltu a0, a2, a0
+; RV64ZICOND-NEXT:    sltiu a0, a2, 2
 ; RV64ZICOND-NEXT:    sd a2, 0(a1)
 ; RV64ZICOND-NEXT:    ret
 entry:
@@ -5364,9 +5364,10 @@ define zeroext i1 @uaddo.i64.constant_2048(i64 %v1, ptr %res) {
 ;
 ; RV64-LABEL: uaddo.i64.constant_2048:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a2, a0, 2047
-; RV64-NEXT:    addi a2, a2, 1
-; RV64-NEXT:    sltu a0, a2, a0
+; RV64-NEXT:    addi a0, a0, 2047
+; RV64-NEXT:    addi a2, a0, 1
+; RV64-NEXT:    srli a0, a2, 11
+; RV64-NEXT:    seqz a0, a0
 ; RV64-NEXT:    sd a2, 0(a1)
 ; RV64-NEXT:    ret
 ;
@@ -5384,9 +5385,10 @@ define zeroext i1 @uaddo.i64.constant_2048(i64 %v1, ptr %res) {
 ;
 ; RV64ZBA-LABEL: uaddo.i64.constant_2048:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addi a2, a0, 2047
-; RV64ZBA-NEXT:    addi a2, a2, 1
-; RV64ZBA-NEXT:    sltu a0, a2, a0
+; RV64ZBA-NEXT:    addi a0, a0, 2047
+; RV64ZBA-NEXT:    addi a2, a0, 1
+; RV64ZBA-NEXT:    srli a0, a2, 11
+; RV64ZBA-NEXT:    seqz a0, a0
 ; RV64ZBA-NEXT:    sd a2, 0(a1)
 ; RV64ZBA-NEXT:    ret
 ;
@@ -5404,9 +5406,10 @@ define zeroext i1 @uaddo.i64.constant_2048(i64 %v1, ptr %res) {
 ;
 ; RV64ZICOND-LABEL: uaddo.i64.constant_2048:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addi a2, a0, 2047
-; RV64ZICOND-NEXT:    addi a2, a2, 1
-; RV64ZICOND-NEXT:    sltu a0, a2, a0
+; RV64ZICOND-NEXT:    addi a0, a0, 2047
+; RV64ZICOND-NEXT:    addi a2, a0, 1
+; RV64ZICOND-NEXT:    srli a0, a2, 11
+; RV64ZICOND-NEXT:    seqz a0, a0
 ; RV64ZICOND-NEXT:    sd a2, 0(a1)
 ; RV64ZICOND-NEXT:    ret
 entry:
@@ -5432,10 +5435,11 @@ define zeroext i1 @uaddo.i64.constant_2049(i64 %v1, ptr %res) {
 ;
 ; RV64-LABEL: uaddo.i64.constant_2049:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi a2, a0, 2047
-; RV64-NEXT:    addi a2, a2, 2
-; RV64-NEXT:    sltu a0, a2, a0
-; RV64-NEXT:    sd a2, 0(a1)
+; RV64-NEXT:    lui a2, 1
+; RV64-NEXT:    addi a2, a2, -2047
+; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    sltu a0, a3, a2
+; RV64-NEXT:    sd a3, 0(a1)
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: uaddo.i64.constant_2049:
@@ -5452,10 +5456,11 @@ define zeroext i1 @uaddo.i64.constant_2049(i64 %v1, ptr %res) {
 ;
 ; RV64ZBA-LABEL: uaddo.i64.constant_2049:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    addi a2, a0, 2047
-; RV64ZBA-NEXT:    addi a2, a2, 2
-; RV64ZBA-NEXT:    sltu a0, a2, a0
-; RV64ZBA-NEXT:    sd a2, 0(a1)
+; RV64ZBA-NEXT:    lui a2, 1
+; RV64ZBA-NEXT:    addi a2, a2, -2047
+; RV64ZBA-NEXT:    add a3, a0, a2
+; RV64ZBA-NEXT:    sltu a0, a3, a2
+; RV64ZBA-NEXT:    sd a3, 0(a1)
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: uaddo.i64.constant_2049:
@@ -5472,10 +5477,11 @@ define zeroext i1 @uaddo.i64.constant_2049(i64 %v1, ptr %res) {
 ;
 ; RV64ZICOND-LABEL: uaddo.i64.constant_2049:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    addi a2, a0, 2047
-; RV64ZICOND-NEXT:    addi a2, a2, 2
-; RV64ZICOND-NEXT:    sltu a0, a2, a0
-; RV64ZICOND-NEXT:    sd a2, 0(a1)
+; RV64ZICOND-NEXT:    lui a2, 1
+; RV64ZICOND-NEXT:    addi a2, a2, -2047
+; RV64ZICOND-NEXT:    add a3, a0, a2
+; RV64ZICOND-NEXT:    sltu a0, a3, a2
+; RV64ZICOND-NEXT:    sd a3, 0(a1)
 ; RV64ZICOND-NEXT:    ret
 entry:
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2049)
@@ -5504,8 +5510,9 @@ define i64 @uaddo.i64.constant_setcc_on_overflow_flag(ptr %p) {
 ;
 ; RV64-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    ld a1, 0(a0)
-; RV64-NEXT:    addi a0, a1, 2
+; RV64-NEXT:    ld a0, 0(a0)
+; RV64-NEXT:    addi a0, a0, 2
+; RV64-NEXT:    li a1, 2
 ; RV64-NEXT:    bltu a0, a1, .LBB69_2
 ; RV64-NEXT:  # %bb.1: # %IfOverflow
 ; RV64-NEXT:    li a0, 0
@@ -5530,8 +5537,9 @@ define i64 @uaddo.i64.constant_setcc_on_overflow_flag(ptr %p) {
 ;
 ; RV64ZBA-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
 ; RV64ZBA:       # %bb.0: # %entry
-; RV64ZBA-NEXT:    ld a1, 0(a0)
-; RV64ZBA-NEXT:    addi a0, a1, 2
+; RV64ZBA-NEXT:    ld a0, 0(a0)
+; RV64ZBA-NEXT:    addi a0, a0, 2
+; RV64ZBA-NEXT:    li a1, 2
 ; RV64ZBA-NEXT:    bltu a0, a1, .LBB69_2
 ; RV64ZBA-NEXT:  # %bb.1: # %IfOverflow
 ; RV64ZBA-NEXT:    li a0, 0
@@ -5556,8 +5564,9 @@ define i64 @uaddo.i64.constant_setcc_on_overflow_flag(ptr %p) {
 ;
 ; RV64ZICOND-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
 ; RV64ZICOND:       # %bb.0: # %entry
-; RV64ZICOND-NEXT:    ld a1, 0(a0)
-; RV64ZICOND-NEXT:    addi a0, a1, 2
+; RV64ZICOND-NEXT:    ld a0, 0(a0)
+; RV64ZICOND-NEXT:    addi a0, a0, 2
+; RV64ZICOND-NEXT:    li a1, 2
 ; RV64ZICOND-NEXT:    bltu a0, a1, .LBB69_2
 ; RV64ZICOND-NEXT:  # %bb.1: # %IfOverflow
 ; RV64ZICOND-NEXT:    li a0, 0
diff --git a/llvm/test/CodeGen/RISCV/xqcia.ll b/llvm/test/CodeGen/RISCV/xqcia.ll
index 6d5fc765c49a8..8d1991f01f0dd 100644
--- a/llvm/test/CodeGen/RISCV/xqcia.ll
+++ b/llvm/test/CodeGen/RISCV/xqcia.ll
@@ -31,10 +31,10 @@ define i32 @addsat(i32 %a, i32 %b) {
 define i32 @addusat(i32 %a, i32 %b) {
 ; RV32I-LABEL: addusat:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    add a1, a0, a1
-; RV32I-NEXT:    sltu a0, a1, a0
-; RV32I-NEXT:    neg a0, a0
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    sltu a1, a0, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32IXQCIA-LABEL: addusat:
diff --git a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
index 40e90d6bdd6af..12818f3a7b78b 100644
--- a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
@@ -10,42 +10,42 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
 ; SPARC-NEXT:    mov %i3, %g2
 ; SPARC-NEXT:    mov %i2, %g4
 ; SPARC-NEXT:    umul %i2, %i5, %i2
-; SPARC-NEXT:    rd %y, %l7
-; SPARC-NEXT:    ld [%fp+92], %l4
+; SPARC-NEXT:    rd %y, %l5
+; SPARC-NEXT:    ld [%fp+92], %l1
 ; SPARC-NEXT:    umul %i4, %i3, %i3
-; SPARC-NEXT:    rd %y, %o1
+; SPARC-NEXT:    rd %y, %l6
 ; SPARC-NEXT:    ld [%fp+96], %g3
 ; SPARC-NEXT:    umul %i5, %g2, %l3
-; SPARC-NEXT:    rd %y, %o0
-; SPARC-NEXT:    umul %l4, %i1, %l2
-; SPARC-NEXT:    rd %y, %l1
-; SPARC-NEXT:    add %i3, %i2, %i2
-; SPARC-NEXT:    umul %i0, %g3, %i3
-; SPARC-NEXT:    rd %y, %l6
-; SPARC-NEXT:    add %o0, %i2, %o2
-; SPARC-NEXT:    umul %i1, %g3, %i2
+; SPARC-NEXT:    rd %y, %l4
+; SPARC-NEXT:    umul %l1, %i1, %l7
 ; SPARC-NEXT:    rd %y, %l0
-; SPARC-NEXT:    add %i3, %l2, %i3
-; SPARC-NEXT:    add %l0, %i3, %l2
-; SPARC-NEXT:    addcc %i2, %l3, %l3
+; SPARC-NEXT:    add %i3, %i2, %o1
+; SPARC-NEXT:    umul %i0, %g3, %i2
+; SPARC-NEXT:    rd %y, %l2
+; SPARC-NEXT:    add %l4, %o1, %o2
+; SPARC-NEXT:    umul %i1, %g3, %i3
+; SPARC-NEXT:    rd %y, %l4
+; SPARC-NEXT:    add %i2, %l7, %l7
+; SPARC-NEXT:    add %l4, %l7, %o0
+; SPARC-NEXT:    addcc %i3, %l3, %l3
 ; SPARC-NEXT:    umul %g2, %g3, %i3
 ; SPARC-NEXT:    rd %y, %i2
-; SPARC-NEXT:    addxcc %l2, %o2, %o4
+; SPARC-NEXT:    addxcc %o0, %o2, %o4
 ; SPARC-NEXT:    umul %g4, %g3, %g3
-; SPARC-NEXT:    rd %y, %l5
+; SPARC-NEXT:    rd %y, %l4
 ; SPARC-NEXT:    addcc %g3, %i2, %i2
-; SPARC-NEXT:    addxcc %l5, 0, %g3
-; SPARC-NEXT:    umul %g2, %l4, %g2
-; SPARC-NEXT:    rd %y, %l5
+; SPARC-NEXT:    addxcc %l4, 0, %g3
+; SPARC-NEXT:    umul %g2, %l1, %g2
+; SPARC-NEXT:    rd %y, %l4
 ; SPARC-NEXT:    addcc %g2, %i2, %i2
-; SPARC-NEXT:    addxcc %l5, 0, %g2
+; SPARC-NEXT:    addxcc %l4, 0, %g2
 ; SPARC-NEXT:    addcc %g3, %g2, %g2
 ; SPARC-NEXT:    addxcc %g0, 0, %g3
-; SPARC-NEXT:    umul %g4, %l4, %l5
+; SPARC-NEXT:    umul %g4, %l1, %l4
 ; SPARC-NEXT:    rd %y, %o3
-; SPARC-NEXT:    addcc %l5, %g2, %l5
+; SPARC-NEXT:    addcc %l4, %g2, %l4
 ; SPARC-NEXT:    addxcc %o3, %g3, %o3
-; SPARC-NEXT:    addcc %l5, %l3, %g2
+; SPARC-NEXT:    addcc %l4, %l3, %g2
 ; SPARC-NEXT:    addxcc %o3, %o4, %g3
 ; SPARC-NEXT:    mov 1, %l3
 ; SPARC-NEXT:    cmp %g3, %o3
@@ -54,101 +54,101 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
 ; SPARC-NEXT:  ! %bb.1: ! %start
 ; SPARC-NEXT:    mov %g0, %o4
 ; SPARC-NEXT:  .LBB0_2: ! %start
-; SPARC-NEXT:    cmp %g2, %l5
+; SPARC-NEXT:    cmp %g2, %l4
 ; SPARC-NEXT:    bcs .LBB0_4
-; SPARC-NEXT:    mov %l3, %l5
+; SPARC-NEXT:    mov %l3, %l4
 ; SPARC-NEXT:  ! %bb.3: ! %start
-; SPARC-NEXT:    mov %g0, %l5
+; SPARC-NEXT:    mov %g0, %l4
 ; SPARC-NEXT:  .LBB0_4: ! %start
 ; SPARC-NEXT:    cmp %g3, %o3
 ; SPARC-NEXT:    be .LBB0_6
 ; SPARC-NEXT:    nop
 ; SPARC-NEXT:  ! %bb.5: ! %start
-; SPARC-NEXT:    mov %o4, %l5
+; SPARC-NEXT:    mov %o4, %l4
 ; SPARC-NEXT:  .LBB0_6: ! %start
-; SPARC-NEXT:    cmp %g4, 0
-; SPARC-NEXT:    bne .LBB0_8
-; SPARC-NEXT:    mov %l3, %o3
+; SPARC-NEXT:    cmp %o2, %o1
+; SPARC-NEXT:    bcs .LBB0_8
+; SPARC-NEXT:    mov %l3, %o1
 ; SPARC-NEXT:  ! %bb.7: ! %start
-; SPARC-NEXT:    mov %g0, %o3
+; SPARC-NEXT:    mov %g0, %o1
 ; SPARC-NEXT:  .LBB0_8: ! %start
-; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    cmp %g4, 0
 ; SPARC-NEXT:    bne .LBB0_10
-; SPARC-NEXT:    mov %l3, %o4
+; SPARC-NEXT:    mov %l3, %o2
 ; SPARC-NEXT:  ! %bb.9: ! %start
-; SPARC-NEXT:    mov %g0, %o4
+; SPARC-NEXT:    mov %g0, %o2
 ; SPARC-NEXT:  .LBB0_10: ! %start
-; SPARC-NEXT:    cmp %o1, 0
+; SPARC-NEXT:    cmp %i4, 0
 ; SPARC-NEXT:    bne .LBB0_12
-; SPARC-NEXT:    mov %l3, %o1
+; SPARC-NEXT:    mov %l3, %o3
 ; SPARC-NEXT:  ! %bb.11: ! %start
-; SPARC-NEXT:    mov %g0, %o1
+; SPARC-NEXT:    mov %g0, %o3
 ; SPARC-NEXT:  .LBB0_12: ! %start
-; SPARC-NEXT:    cmp %l7, 0
+; SPARC-NEXT:    cmp %l6, 0
 ; SPARC-NEXT:    bne .LBB0_14
-; SPARC-NEXT:    mov %l3, %l7
+; SPARC-NEXT:    mov %l3, %l6
 ; SPARC-NEXT:  ! %bb.13: ! %start
-; SPARC-NEXT:    mov %g0, %l7
+; SPARC-NEXT:    mov %g0, %l6
 ; SPARC-NEXT:  .LBB0_14: ! %start
-; SPARC-NEXT:    cmp %o2, %o0
-; SPARC-NEXT:    bcs .LBB0_16
-; SPARC-NEXT:    mov %l3, %g4
+; SPARC-NEXT:    cmp %l5, 0
+; SPARC-NEXT:    bne .LBB0_16
+; SPARC-NEXT:    mov %l3, %l5
 ; SPARC-NEXT:  ! %bb.15: ! %start
-; SPARC-NEXT:    mov %g0, %g4
+; SPARC-NEXT:    mov %g0, %l5
 ; SPARC-NEXT:  .LBB0_16: ! %start
-; SPARC-NEXT:    cmp %l4, 0
-; SPARC-NEXT:    bne .LBB0_18
-; SPARC-NEXT:    mov %l3, %l4
+; SPARC-NEXT:    cmp %o0, %l7
+; SPARC-NEXT:    bcs .LBB0_18
+; SPARC-NEXT:    mov %l3, %g4
 ; SPARC-NEXT:  ! %bb.17: ! %start
-; SPARC-NEXT:    mov %g0, %l4
+; SPARC-NEXT:    mov %g0, %g4
 ; SPARC-NEXT:  .LBB0_18: ! %start
-; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    cmp %l1, 0
 ; SPARC-NEXT:    bne .LBB0_20
-; SPARC-NEXT:    mov %l3, %o0
+; SPARC-NEXT:    mov %l3, %l1
 ; SPARC-NEXT:  ! %bb.19: ! %start
-; SPARC-NEXT:    mov %g0, %o0
+; SPARC-NEXT:    mov %g0, %l1
 ; SPARC-NEXT:  .LBB0_20: ! %start
-; SPARC-NEXT:    cmp %l6, 0
+; SPARC-NEXT:    cmp %i0, 0
 ; SPARC-NEXT:    bne .LBB0_22
-; SPARC-NEXT:    mov %l3, %l6
+; SPARC-NEXT:    mov %l3, %o0
 ; SPARC-NEXT:  ! %bb.21: ! %start
-; SPARC-NEXT:    mov %g0, %l6
+; SPARC-NEXT:    mov %g0, %o0
 ; SPARC-NEXT:  .LBB0_22: ! %start
-; SPARC-NEXT:    and %o4, %o3, %o2
-; SPARC-NEXT:    cmp %l1, 0
-; SPARC-NEXT:    and %o0, %l4, %o0
+; SPARC-NEXT:    and %o3, %o2, %l7
+; SPARC-NEXT:    cmp %l2, 0
+; SPARC-NEXT:    and %o0, %l1, %l2
 ; SPARC-NEXT:    bne .LBB0_24
-; SPARC-NEXT:    mov %l3, %l1
+; SPARC-NEXT:    mov %l3, %o0
 ; SPARC-NEXT:  ! %bb.23: ! %start
-; SPARC-NEXT:    mov %g0, %l1
+; SPARC-NEXT:    mov %g0, %o0
 ; SPARC-NEXT:  .LBB0_24: ! %start
-; SPARC-NEXT:    or %o2, %o1, %l4
-; SPARC-NEXT:    cmp %l2, %l0
-; SPARC-NEXT:    or %o0, %l6, %l6
-; SPARC-NEXT:    bcs .LBB0_26
-; SPARC-NEXT:    mov %l3, %l0
+; SPARC-NEXT:    or %l7, %l6, %l1
+; SPARC-NEXT:    cmp %l0, 0
+; SPARC-NEXT:    or %l2, %o0, %l2
+; SPARC-NEXT:    bne .LBB0_26
+; SPARC-NEXT:    mov %l3, %l6
 ; SPARC-NEXT:  ! %bb.25: ! %start
-; SPARC-NEXT:    mov %g0, %l0
+; SPARC-NEXT:    mov %g0, %l6
 ; SPARC-NEXT:  .LBB0_26: ! %start
-; SPARC-NEXT:    or %l4, %l7, %l2
+; SPARC-NEXT:    or %l1, %l5, %l0
 ; SPARC-NEXT:    orcc %i5, %i4, %g0
-; SPARC-NEXT:    or %l6, %l1, %l1
+; SPARC-NEXT:    or %l2, %l6, %l1
 ; SPARC-NEXT:    bne .LBB0_28
 ; SPARC-NEXT:    mov %l3, %i4
 ; SPARC-NEXT:  ! %bb.27: ! %start
 ; SPARC-NEXT:    mov %g0, %i4
 ; SPARC-NEXT:  .LBB0_28: ! %start
-; SPARC-NEXT:    or %l2, %g4, %i5
+; SPARC-NEXT:    or %l0, %o1, %i5
 ; SPARC-NEXT:    orcc %i1, %i0, %g0
 ; SPARC-NEXT:    bne .LBB0_30
-; SPARC-NEXT:    or %l1, %l0, %i0
+; SPARC-NEXT:    or %l1, %g4, %i0
 ; SPARC-NEXT:  ! %bb.29: ! %start
 ; SPARC-NEXT:    mov %g0, %l3
 ; SPARC-NEXT:  .LBB0_30: ! %start
 ; SPARC-NEXT:    and %l3, %i4, %i1
 ; SPARC-NEXT:    or %i1, %i0, %i0
 ; SPARC-NEXT:    or %i0, %i5, %i0
-; SPARC-NEXT:    or %i0, %l5, %i0
+; SPARC-NEXT:    or %i0, %l4, %i0
 ; SPARC-NEXT:    and %i0, 1, %i4
 ; SPARC-NEXT:    mov %g3, %i0
 ; SPARC-NEXT:    ret
@@ -173,7 +173,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
 ; SPARC64-NEXT:    call __multi3
 ; SPARC64-NEXT:    mov %i3, %o3
 ; SPARC64-NEXT:    mov %o0, %l0
-; SPARC64-NEXT:    add %o1, %i5, %i0
+; SPARC64-NEXT:    add %o1, %i5, %i5
 ; SPARC64-NEXT:    mov %g0, %o0
 ; SPARC64-NEXT:    mov %i1, %o1
 ; SPARC64-NEXT:    mov %g0, %o2
@@ -181,19 +181,19 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
 ; SPARC64-NEXT:    mov %i3, %o3
 ; SPARC64-NEXT:    mov %g0, %i1
 ; SPARC64-NEXT:    mov %g0, %i3
-; SPARC64-NEXT:    mov %g0, %i5
 ; SPARC64-NEXT:    mov %g0, %g2
 ; SPARC64-NEXT:    mov %g0, %g3
-; SPARC64-NEXT:    add %o0, %i0, %i0
-; SPARC64-NEXT:    cmp %i0, %o0
+; SPARC64-NEXT:    mov %g0, %g4
+; SPARC64-NEXT:    add %o0, %i5, %i0
+; SPARC64-NEXT:    cmp %i0, %i5
 ; SPARC64-NEXT:    movrnz %l0, 1, %i3
-; SPARC64-NEXT:    movrnz %i2, 1, %i5
-; SPARC64-NEXT:    movrnz %l1, 1, %g2
+; SPARC64-NEXT:    movrnz %i2, 1, %g2
+; SPARC64-NEXT:    movrnz %l1, 1, %g3
 ; SPARC64-NEXT:    movcs %xcc, 1, %i1
-; SPARC64-NEXT:    and %g2, %i5, %i2
+; SPARC64-NEXT:    and %g3, %g2, %i2
 ; SPARC64-NEXT:    or %i2, %i3, %i2
-; SPARC64-NEXT:    movrnz %i4, 1, %g3
-; SPARC64-NEXT:    or %i2, %g3, %i2
+; SPARC64-NEXT:    movrnz %i4, 1, %g4
+; SPARC64-NEXT:    or %i2, %g4, %i2
 ; SPARC64-NEXT:    or %i2, %i1, %i1
 ; SPARC64-NEXT:    srl %i1, 0, %i2
 ; SPARC64-NEXT:    ret
@@ -211,9 +211,9 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
 ; SPARC64-VIS3-NEXT:    mov %g0, %g5
 ; SPARC64-VIS3-NEXT:    mulx %i2, %i1, %i4
 ; SPARC64-VIS3-NEXT:    mulx %i0, %i3, %l0
-; SPARC64-VIS3-NEXT:    add %l0, %i4, %i4
-; SPARC64-VIS3-NEXT:    umulxhi %i1, %i3, %l0
-; SPARC64-VIS3-NEXT:    add %l0, %i4, %i4
+; SPARC64-VIS3-NEXT:    add %l0, %i4, %l0
+; SPARC64-VIS3-NEXT:    umulxhi %i1, %i3, %i4
+; SPARC64-VIS3-NEXT:    add %i4, %l0, %i4
 ; SPARC64-VIS3-NEXT:    cmp %i4, %l0
 ; SPARC64-VIS3-NEXT:    movrnz %i2, 1, %g2
 ; SPARC64-VIS3-NEXT:    movrnz %i0, 1, %g3
diff --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
index 1e537fe64c08d..eb1eb301dfd9c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
@@ -118,28 +118,28 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @uadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
 ; CHECK-LABEL: uadd_int64_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov r0, r1, d3
 ; CHECK-NEXT:    vmov r2, r3, d1
-; CHECK-NEXT:    adds r5, r2, r0
-; CHECK-NEXT:    adc.w lr, r3, r1
-; CHECK-NEXT:    subs r2, r5, r2
-; CHECK-NEXT:    sbcs.w r2, lr, r3
-; CHECK-NEXT:    vmov r3, r12, d2
-; CHECK-NEXT:    vmov r1, r4, d0
-; CHECK-NEXT:    csetm r2, lo
-; CHECK-NEXT:    adds r3, r3, r1
-; CHECK-NEXT:    adc.w r0, r4, r12
-; CHECK-NEXT:    subs r1, r3, r1
-; CHECK-NEXT:    sbcs.w r1, r0, r4
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r5
+; CHECK-NEXT:    adds.w lr, r2, r0
+; CHECK-NEXT:    vmov r2, r4, d0
+; CHECK-NEXT:    adc.w r12, r3, r1
+; CHECK-NEXT:    subs.w r0, lr, r0
+; CHECK-NEXT:    sbcs.w r0, r12, r1
+; CHECK-NEXT:    vmov r1, r3, d2
+; CHECK-NEXT:    csetm r0, lo
+; CHECK-NEXT:    adds r2, r2, r1
+; CHECK-NEXT:    adcs r4, r3
+; CHECK-NEXT:    subs r1, r2, r1
+; CHECK-NEXT:    sbcs.w r1, r4, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, lr
 ; CHECK-NEXT:    csetm r1, lo
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, lr
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r4, r12
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
 ; CHECK-NEXT:    vorr q0, q1, q0
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %0 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2)
   ret <2 x i64> %0
diff --git a/llvm/test/CodeGen/WebAssembly/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/WebAssembly/umulo-128-legalisation-lowering.ll
index 110fb2d43580a..373f4f2077b15 100644
--- a/llvm/test/CodeGen/WebAssembly/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/WebAssembly/umulo-128-legalisation-lowering.ll
@@ -1,78 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=wasm32 -wasm-keep-registers | FileCheck %s --check-prefixes=WASM32
 ; NOTE: did not compile on wasm64 at the time the test was created!
 
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
-; WASM32-LABEL: muloti_test
-; WASM32: global.get      $push16=, __stack_pointer
-; WASM32: i32.const       $push17=, 48
-; WASM32: i32.sub         $push38=, $pop16, $pop17
-; WASM32: local.tee       $push37=, 5, $pop38
-; WASM32: global.set      __stack_pointer, $pop37
-; WASM32: local.get       $push39=, 5
-; WASM32: i32.const       $push22=, 32
-; WASM32: i32.add         $push23=, $pop39, $pop22
-; WASM32: local.get       $push41=, 1
-; WASM32: i64.const       $push0=, 0
-; WASM32: local.get       $push40=, 3
-; WASM32: i64.const       $push36=, 0
-; WASM32: call __multi3,  $pop23, $pop41, $pop0, $pop40, $pop36
-; WASM32: local.get       $push42=, 5
-; WASM32: i32.const       $push20=, 16
-; WASM32: i32.add         $push21=, $pop42, $pop20
-; WASM32: local.get       $push44=, 4
-; WASM32: i64.const       $push35=, 0
-; WASM32: local.get       $push43=, 1
-; WASM32: i64.const       $push34=, 0
-; WASM32: call __multi3,  $pop21, $pop44, $pop35, $pop43, $pop34
-; WASM32: local.get       $push47=, 5
-; WASM32: local.get       $push46=, 2
-; WASM32: i64.const       $push33=, 0
-; WASM32: local.get       $push45=, 3
-; WASM32: i64.const       $push32=, 0
-; WASM32: call __multi3,  $pop47, $pop46, $pop33, $pop45, $pop32
-; WASM32: local.get       $push49=, 0
-; WASM32: local.get       $push48=, 5
-; WASM32: i64.load        $push1=, 32($pop48)
-; WASM32: i64.store       0($pop49), $pop1
-; WASM32: local.get       $push53=, 0
-; WASM32: local.get       $push50=, 5
-; WASM32: i64.load        $push31=, 40($pop50)
-; WASM32: local.tee       $push30=, 3, $pop31
-; WASM32: local.get       $push51=, 5
-; WASM32: i64.load        $push3=, 0($pop51)
-; WASM32: local.get       $push52=, 5
-; WASM32: i64.load        $push2=, 16($pop52)
-; WASM32: i64.add         $push4=, $pop3, $pop2
-; WASM32: i64.add         $push29=, $pop30, $pop4
-; WASM32: local.tee       $push28=, 1, $pop29
-; WASM32: i64.store       8($pop53), $pop28
-; WASM32: local.get       $push60=, 0
-; WASM32: local.get       $push54=, 2
-; WASM32: i64.const       $push27=, 0
-; WASM32: i64.ne          $push6=, $pop54, $pop27
-; WASM32: local.get       $push55=, 4
-; WASM32: i64.const       $push26=, 0
-; WASM32: i64.ne          $push5=, $pop55, $pop26
-; WASM32: i32.and         $push7=, $pop6, $pop5
-; WASM32: local.get       $push56=, 5
-; WASM32: i64.load        $push8=, 8($pop56)
-; WASM32: i64.const       $push25=, 0
-; WASM32: i64.ne          $push9=, $pop8, $pop25
-; WASM32: i32.or          $push10=, $pop7, $pop9
-; WASM32: local.get       $push57=, 5
-; WASM32: i64.load        $push11=, 24($pop57)
-; WASM32: i64.const       $push24=, 0
-; WASM32: i64.ne          $push12=, $pop11, $pop24
-; WASM32: i32.or          $push13=, $pop10, $pop12
-; WASM32: local.get       $push59=, 1
-; WASM32: local.get       $push58=, 3
-; WASM32: i64.lt_u        $push14=, $pop59, $pop58
-; WASM32: i32.or          $push15=, $pop13, $pop14
-; WASM32: i32.store8      16($pop60), $pop15
-; WASM32: local.get       $push61=, 5
-; WASM32: i32.const       $push18=, 48
-; WASM32: i32.add         $push19=, $pop61, $pop18
-; WASM32: global.set      __stack_pointer, $pop19
+; WASM32-LABEL: muloti_test:
+; WASM32:         .functype muloti_test (i32, i64, i64, i64, i64) -> ()
+; WASM32-NEXT:    .local i32
+; WASM32-NEXT:  # %bb.0: # %start
+; WASM32-NEXT:    global.get $push16=, __stack_pointer
+; WASM32-NEXT:    i32.const $push17=, 48
+; WASM32-NEXT:    i32.sub $push38=, $pop16, $pop17
+; WASM32-NEXT:    local.tee $push37=, 5, $pop38
+; WASM32-NEXT:    global.set __stack_pointer, $pop37
+; WASM32-NEXT:    local.get $push39=, 5
+; WASM32-NEXT:    i32.const $push22=, 32
+; WASM32-NEXT:    i32.add $push23=, $pop39, $pop22
+; WASM32-NEXT:    local.get $push41=, 1
+; WASM32-NEXT:    i64.const $push0=, 0
+; WASM32-NEXT:    local.get $push40=, 3
+; WASM32-NEXT:    i64.const $push36=, 0
+; WASM32-NEXT:    call __multi3, $pop23, $pop41, $pop0, $pop40, $pop36
+; WASM32-NEXT:    local.get $push42=, 5
+; WASM32-NEXT:    i32.const $push20=, 16
+; WASM32-NEXT:    i32.add $push21=, $pop42, $pop20
+; WASM32-NEXT:    local.get $push44=, 4
+; WASM32-NEXT:    i64.const $push35=, 0
+; WASM32-NEXT:    local.get $push43=, 1
+; WASM32-NEXT:    i64.const $push34=, 0
+; WASM32-NEXT:    call __multi3, $pop21, $pop44, $pop35, $pop43, $pop34
+; WASM32-NEXT:    local.get $push47=, 5
+; WASM32-NEXT:    local.get $push46=, 2
+; WASM32-NEXT:    i64.const $push33=, 0
+; WASM32-NEXT:    local.get $push45=, 3
+; WASM32-NEXT:    i64.const $push32=, 0
+; WASM32-NEXT:    call __multi3, $pop47, $pop46, $pop33, $pop45, $pop32
+; WASM32-NEXT:    local.get $push49=, 0
+; WASM32-NEXT:    local.get $push48=, 5
+; WASM32-NEXT:    i64.load $push1=, 32($pop48)
+; WASM32-NEXT:    i64.store 0($pop49), $pop1
+; WASM32-NEXT:    local.get $push53=, 0
+; WASM32-NEXT:    local.get $push50=, 5
+; WASM32-NEXT:    i64.load $push4=, 40($pop50)
+; WASM32-NEXT:    local.get $push51=, 5
+; WASM32-NEXT:    i64.load $push3=, 0($pop51)
+; WASM32-NEXT:    local.get $push52=, 5
+; WASM32-NEXT:    i64.load $push2=, 16($pop52)
+; WASM32-NEXT:    i64.add $push31=, $pop3, $pop2
+; WASM32-NEXT:    local.tee $push30=, 3, $pop31
+; WASM32-NEXT:    i64.add $push29=, $pop4, $pop30
+; WASM32-NEXT:    local.tee $push28=, 1, $pop29
+; WASM32-NEXT:    i64.store 8($pop53), $pop28
+; WASM32-NEXT:    local.get $push60=, 0
+; WASM32-NEXT:    local.get $push54=, 2
+; WASM32-NEXT:    i64.const $push27=, 0
+; WASM32-NEXT:    i64.ne $push7=, $pop54, $pop27
+; WASM32-NEXT:    local.get $push55=, 4
+; WASM32-NEXT:    i64.const $push26=, 0
+; WASM32-NEXT:    i64.ne $push6=, $pop55, $pop26
+; WASM32-NEXT:    i32.and $push8=, $pop7, $pop6
+; WASM32-NEXT:    local.get $push56=, 5
+; WASM32-NEXT:    i64.load $push9=, 8($pop56)
+; WASM32-NEXT:    i64.const $push25=, 0
+; WASM32-NEXT:    i64.ne $push10=, $pop9, $pop25
+; WASM32-NEXT:    i32.or $push11=, $pop8, $pop10
+; WASM32-NEXT:    local.get $push57=, 5
+; WASM32-NEXT:    i64.load $push12=, 24($pop57)
+; WASM32-NEXT:    i64.const $push24=, 0
+; WASM32-NEXT:    i64.ne $push13=, $pop12, $pop24
+; WASM32-NEXT:    i32.or $push14=, $pop11, $pop13
+; WASM32-NEXT:    local.get $push59=, 1
+; WASM32-NEXT:    local.get $push58=, 3
+; WASM32-NEXT:    i64.lt_u $push5=, $pop59, $pop58
+; WASM32-NEXT:    i32.or $push15=, $pop14, $pop5
+; WASM32-NEXT:    i32.store8 16($pop60), $pop15
+; WASM32-NEXT:    local.get $push61=, 5
+; WASM32-NEXT:    i32.const $push18=, 48
+; WASM32-NEXT:    i32.add $push19=, $pop61, $pop18
+; WASM32-NEXT:    global.set __stack_pointer, $pop19
+; WASM32-NEXT:    # fallthrough-return
 
 start:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
index dbfa69d497698..683442c88b43b 100644
--- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
@@ -1782,12 +1782,11 @@ define <4 x i32> @vp_uadd_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i
 ; SSE-LABEL: vp_uadd_sat_v4i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    pxor %xmm2, %xmm3
 ; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm1
 ; SSE-NEXT:    pxor %xmm0, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSE-NEXT:    por %xmm3, %xmm0
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: vp_uadd_sat_v4i32:
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index b12be7cb129d3..bb6104539108a 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -711,21 +711,19 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) {
 ;
 ; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT:    movdqa %xmm0, %xmm2
-; SSE42-NEXT:    pxor %xmm1, %xmm2
 ; SSE42-NEXT:    paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
 ; SSE42-NEXT:    pxor %xmm0, %xmm1
+; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775850,9223372036854775850]
 ; SSE42-NEXT:    pcmpgtq %xmm1, %xmm2
 ; SSE42-NEXT:    por %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
 ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775850,9223372036854775850]
 ; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
@@ -770,21 +768,19 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) {
 ;
 ; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT:    movdqa %xmm0, %xmm2
-; SSE42-NEXT:    pxor %xmm1, %xmm2
 ; SSE42-NEXT:    paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
 ; SSE42-NEXT:    pxor %xmm0, %xmm1
+; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775850,9223372036854775850]
 ; SSE42-NEXT:    pcmpgtq %xmm1, %xmm2
 ; SSE42-NEXT:    por %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
 ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775850,9223372036854775850]
 ; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
@@ -1251,18 +1247,17 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i
 ; SSE42-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT:    movdqa %xmm0, %xmm3
-; SSE42-NEXT:    pxor %xmm2, %xmm3
 ; SSE42-NEXT:    paddq %xmm1, %xmm0
+; SSE42-NEXT:    pxor %xmm2, %xmm1
 ; SSE42-NEXT:    pxor %xmm0, %xmm2
-; SSE42-NEXT:    pcmpgtq %xmm2, %xmm3
-; SSE42-NEXT:    por %xmm3, %xmm0
+; SSE42-NEXT:    pcmpgtq %xmm2, %xmm1
+; SSE42-NEXT:    por %xmm1, %xmm0
 ; SSE42-NEXT:    retq
 ;
 ; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm1
 ; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
index d744ce6ed6af0..20941042ea7a7 100644
--- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
@@ -540,23 +540,21 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE2-LABEL: v2i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm2, %xmm3
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v2i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pxor %xmm2, %xmm3
 ; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
 ; SSSE3-NEXT:    pxor %xmm0, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT:    por %xmm3, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v2i32:
@@ -608,23 +606,21 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-LABEL: v4i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm2, %xmm3
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v4i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pxor %xmm2, %xmm3
 ; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
 ; SSSE3-NEXT:    pxor %xmm0, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT:    por %xmm3, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT:    por %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v4i32:
@@ -676,37 +672,33 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; SSE2-LABEL: v8i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT:    por %xmm5, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    pxor %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm3
 ; SSE2-NEXT:    pxor %xmm1, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    por %xmm3, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v8i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
 ; SSSE3-NEXT:    paddd %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT:    por %xmm5, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; SSSE3-NEXT:    pxor %xmm4, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSSE3-NEXT:    paddd %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
 ; SSSE3-NEXT:    pxor %xmm1, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSSE3-NEXT:    por %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT:    por %xmm3, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v8i32:
@@ -767,65 +759,57 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; SSE2-LABEL: v16i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm9
-; SSE2-NEXT:    pxor %xmm8, %xmm9
 ; SSE2-NEXT:    paddd %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm9
-; SSE2-NEXT:    por %xmm9, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    paddd %xmm5, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
 ; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    por %xmm5, %xmm1
 ; SSE2-NEXT:    paddd %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
 ; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    por %xmm6, %xmm2
 ; SSE2-NEXT:    paddd %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm7
 ; SSE2-NEXT:    pxor %xmm3, %xmm8
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm7
+; SSE2-NEXT:    por %xmm7, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: v16i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm9
-; SSSE3-NEXT:    pxor %xmm8, %xmm9
 ; SSSE3-NEXT:    paddd %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
-; SSSE3-NEXT:    pxor %xmm8, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm9
-; SSSE3-NEXT:    por %xmm9, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
 ; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    movdqa %xmm0, %xmm9
+; SSSE3-NEXT:    pxor %xmm8, %xmm9
+; SSSE3-NEXT:    pcmpgtd %xmm9, %xmm4
+; SSSE3-NEXT:    por %xmm4, %xmm0
 ; SSSE3-NEXT:    paddd %xmm5, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm5
 ; SSSE3-NEXT:    pxor %xmm8, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT:    por %xmm4, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
 ; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT:    por %xmm5, %xmm1
 ; SSSE3-NEXT:    paddd %xmm6, %xmm2
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    pxor %xmm8, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT:    por %xmm4, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm8, %xmm6
+; SSSE3-NEXT:    movdqa %xmm2, %xmm4
 ; SSSE3-NEXT:    pxor %xmm8, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT:    por %xmm6, %xmm2
 ; SSSE3-NEXT:    paddd %xmm7, %xmm3
+; SSSE3-NEXT:    pxor %xmm8, %xmm7
 ; SSSE3-NEXT:    pxor %xmm3, %xmm8
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT:    por %xmm4, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm7
+; SSSE3-NEXT:    por %xmm7, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: v16i32:
@@ -897,26 +881,25 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSE-LABEL: v2i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    pxor %xmm2, %xmm3
 ; SSE-NEXT:    paddq %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm1
 ; SSE-NEXT:    pxor %xmm0, %xmm2
-; SSE-NEXT:    movdqa %xmm3, %xmm1
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
-; SSE-NEXT:    pcmpeqd %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm1, %xmm3
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
 ; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: v2i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX1-NEXT:    # xmm2 = mem[0,0]
-; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm1
 ; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
@@ -926,7 +909,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; AVX2-LABEL: v2i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm1
 ; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
@@ -959,47 +942,45 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; SSE-LABEL: v4i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT:    movdqa %xmm0, %xmm5
-; SSE-NEXT:    pxor %xmm4, %xmm5
 ; SSE-NEXT:    paddq %xmm2, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm2
 ; SSE-NEXT:    pxor %xmm4, %xmm2
-; SSE-NEXT:    movdqa %xmm5, %xmm6
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pxor %xmm4, %xmm5
+; SSE-NEXT:    movdqa %xmm2, %xmm6
+; SSE-NEXT:    pcmpgtd %xmm5, %xmm6
 ; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE-NEXT:    pcmpeqd %xmm5, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm5
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
 ; SSE-NEXT:    pand %xmm7, %xmm2
 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
 ; SSE-NEXT:    por %xmm5, %xmm0
 ; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    pxor %xmm4, %xmm2
 ; SSE-NEXT:    paddq %xmm3, %xmm1
+; SSE-NEXT:    pxor %xmm4, %xmm3
 ; SSE-NEXT:    pxor %xmm1, %xmm4
-; SSE-NEXT:    movdqa %xmm2, %xmm3
-; SSE-NEXT:    pcmpgtd %xmm4, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE-NEXT:    pand %xmm5, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT:    por %xmm3, %xmm1
+; SSE-NEXT:    movdqa %xmm3, %xmm2
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm3, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE-NEXT:    pand %xmm5, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE-NEXT:    por %xmm2, %xmm1
+; SSE-NEXT:    por %xmm3, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
 ; AVX1-NEXT:    # xmm3 = mem[0,0]
 ; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm5
 ; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm4
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm1
 ; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm1
@@ -1010,7 +991,7 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; AVX2-LABEL: v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm3
+; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm3
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm1
 ; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm3, %ymm1
@@ -1042,88 +1023,84 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; SSE-LABEL: v8i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT:    movdqa %xmm0, %xmm9
-; SSE-NEXT:    pxor %xmm8, %xmm9
 ; SSE-NEXT:    paddq %xmm4, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm4
 ; SSE-NEXT:    pxor %xmm8, %xmm4
-; SSE-NEXT:    movdqa %xmm9, %xmm10
-; SSE-NEXT:    pcmpgtd %xmm4, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm9
+; SSE-NEXT:    pxor %xmm8, %xmm9
+; SSE-NEXT:    movdqa %xmm4, %xmm10
+; SSE-NEXT:    pcmpgtd %xmm9, %xmm10
 ; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE-NEXT:    pcmpeqd %xmm9, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT:    pcmpeqd %xmm4, %xmm9
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
 ; SSE-NEXT:    pand %xmm11, %xmm4
 ; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
 ; SSE-NEXT:    por %xmm9, %xmm0
 ; SSE-NEXT:    por %xmm4, %xmm0
-; SSE-NEXT:    movdqa %xmm1, %xmm4
-; SSE-NEXT:    pxor %xmm8, %xmm4
 ; SSE-NEXT:    paddq %xmm5, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm5
 ; SSE-NEXT:    pxor %xmm8, %xmm5
-; SSE-NEXT:    movdqa %xmm4, %xmm9
-; SSE-NEXT:    pcmpgtd %xmm5, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pxor %xmm8, %xmm4
+; SSE-NEXT:    movdqa %xmm5, %xmm9
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm9
 ; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE-NEXT:    pcmpeqd %xmm4, %xmm5
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE-NEXT:    pand %xmm10, %xmm4
 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
 ; SSE-NEXT:    por %xmm5, %xmm1
 ; SSE-NEXT:    por %xmm4, %xmm1
+; SSE-NEXT:    paddq %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm8, %xmm6
 ; SSE-NEXT:    movdqa %xmm2, %xmm4
 ; SSE-NEXT:    pxor %xmm8, %xmm4
-; SSE-NEXT:    paddq %xmm6, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm5
-; SSE-NEXT:    pxor %xmm8, %xmm5
-; SSE-NEXT:    movdqa %xmm4, %xmm6
-; SSE-NEXT:    pcmpgtd %xmm5, %xmm6
-; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
-; SSE-NEXT:    pcmpeqd %xmm4, %xmm5
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE-NEXT:    movdqa %xmm6, %xmm5
+; SSE-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm6, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE-NEXT:    pand %xmm9, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSE-NEXT:    por %xmm5, %xmm2
 ; SSE-NEXT:    por %xmm4, %xmm2
-; SSE-NEXT:    movdqa %xmm3, %xmm4
-; SSE-NEXT:    pxor %xmm8, %xmm4
 ; SSE-NEXT:    paddq %xmm7, %xmm3
+; SSE-NEXT:    pxor %xmm8, %xmm7
 ; SSE-NEXT:    pxor %xmm3, %xmm8
-; SSE-NEXT:    movdqa %xmm4, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm8, %xmm5
-; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE-NEXT:    pcmpeqd %xmm4, %xmm8
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
-; SSE-NEXT:    pand %xmm6, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE-NEXT:    por %xmm5, %xmm3
+; SSE-NEXT:    movdqa %xmm7, %xmm4
+; SSE-NEXT:    pcmpgtd %xmm8, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE-NEXT:    pcmpeqd %xmm7, %xmm8
+; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3]
+; SSE-NEXT:    pand %xmm5, %xmm6
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE-NEXT:    por %xmm4, %xmm3
+; SSE-NEXT:    por %xmm6, %xmm3
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: v8i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
 ; AVX1-NEXT:    # xmm5 = mem[0,0]
 ; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm6
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT:    vpaddq %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT:    vpaddq %xmm4, %xmm7, %xmm4
 ; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm6, %xmm6
 ; AVX1-NEXT:    vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm6
 ; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm6, %xmm2
 ; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
 ; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpaddq %xmm2, %xmm6, %xmm2
 ; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm4
 ; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
@@ -1134,12 +1111,12 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; AVX2-LABEL: v8i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm4, %ymm2, %ymm5
 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm2
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm5, %ymm2
 ; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm2
+; AVX2-NEXT:    vpxor %ymm4, %ymm3, %ymm2
 ; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm3
 ; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index be7888cd76a6b..4febf7d7128ce 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -43,50 +43,53 @@ define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-LABEL: uaddo_v2i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT:    movq %xmm1, (%rdi)
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    movq %xmm0, (%rdi)
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: uaddo_v2i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    paddd %xmm0, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT:    movq %xmm1, (%rdi)
+; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT:    movq %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: uaddo_v2i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddd %xmm0, %xmm1
-; SSE41-NEXT:    pmaxud %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT:    paddd %xmm1, %xmm0
+; SSE41-NEXT:    pmaxud %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    pxor %xmm2, %xmm0
-; SSE41-NEXT:    movq %xmm1, (%rdi)
+; SSE41-NEXT:    pxor %xmm1, %xmm2
+; SSE41-NEXT:    movq %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: uaddo_v2i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm1, (%rdi)
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxud %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovq %xmm2, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: uaddo_v2i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpltud %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpcmpltud %xmm1, %xmm2, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    vmovq %xmm1, (%rdi)
+; AVX512-NEXT:    vmovq %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
   %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
@@ -100,57 +103,60 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-LABEL: uaddo_v3i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT:    movq %xmm1, (%rdi)
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE2-NEXT:    movd %xmm1, 8(%rdi)
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    movq %xmm0, (%rdi)
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movd %xmm0, 8(%rdi)
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: uaddo_v3i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    paddd %xmm0, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT:    movq %xmm1, (%rdi)
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSSE3-NEXT:    movd %xmm1, 8(%rdi)
+; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT:    movq %xmm0, (%rdi)
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSSE3-NEXT:    movd %xmm0, 8(%rdi)
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: uaddo_v3i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddd %xmm0, %xmm1
-; SSE41-NEXT:    pmaxud %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT:    paddd %xmm1, %xmm0
+; SSE41-NEXT:    pmaxud %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    pxor %xmm2, %xmm0
-; SSE41-NEXT:    pextrd $2, %xmm1, 8(%rdi)
-; SSE41-NEXT:    movq %xmm1, (%rdi)
+; SSE41-NEXT:    pxor %xmm1, %xmm2
+; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdi)
+; SSE41-NEXT:    movq %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: uaddo_v3i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
-; AVX-NEXT:    vmovq %xmm1, (%rdi)
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxud %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpextrd $2, %xmm2, 8(%rdi)
+; AVX-NEXT:    vmovq %xmm2, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: uaddo_v3i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpltud %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpcmpltud %xmm1, %xmm2, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
-; AVX512-NEXT:    vmovq %xmm1, (%rdi)
+; AVX512-NEXT:    vpextrd $2, %xmm2, 8(%rdi)
+; AVX512-NEXT:    vmovq %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
   %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
@@ -164,50 +170,53 @@ define <4 x i32> @uaddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-LABEL: uaddo_v4i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, (%rdi)
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: uaddo_v4i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    paddd %xmm0, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
+; SSSE3-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: uaddo_v4i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddd %xmm0, %xmm1
-; SSE41-NEXT:    pmaxud %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT:    paddd %xmm1, %xmm0
+; SSE41-NEXT:    pmaxud %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    pxor %xmm2, %xmm0
-; SSE41-NEXT:    movdqa %xmm1, (%rdi)
+; SSE41-NEXT:    pxor %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: uaddo_v4i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxud %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: uaddo_v4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpltud %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpcmpltud %xmm1, %xmm2, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
   %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
@@ -222,38 +231,38 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT:    movd %r8d, %xmm0
-; SSE2-NEXT:    movd %ecx, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    movd %edx, %xmm3
-; SSE2-NEXT:    movd %esi, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movd %r8d, %xmm1
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    movd %edx, %xmm1
+; SSE2-NEXT:    movd %esi, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT:    movd %r9d, %xmm2
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    movd %r9d, %xmm3
 ; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    paddd %xmm0, %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm1, (%rcx)
-; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    pxor %xmm4, %xmm2
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT:    paddd %xmm1, %xmm3
 ; SSE2-NEXT:    movq %xmm3, 16(%rcx)
 ; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT:    movq %xmm2, 16(%rdi)
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT:    movq %xmm1, 16(%rdi)
 ; SSE2-NEXT:    movdqa %xmm0, (%rdi)
 ; SSE2-NEXT:    retq
 ;
@@ -261,38 +270,38 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movq %rdi, %rax
 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSSE3-NEXT:    movd %r8d, %xmm0
-; SSSE3-NEXT:    movd %ecx, %xmm2
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSSE3-NEXT:    movd %edx, %xmm3
-; SSSE3-NEXT:    movd %esi, %xmm0
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    movd %r8d, %xmm1
+; SSSE3-NEXT:    movd %ecx, %xmm3
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT:    movd %edx, %xmm1
+; SSSE3-NEXT:    movd %esi, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; SSSE3-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSSE3-NEXT:    movd %r9d, %xmm2
+; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSSE3-NEXT:    movd %r9d, %xmm3
 ; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT:    paddd %xmm0, %xmm1
+; SSSE3-NEXT:    paddd %xmm0, %xmm2
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm1, (%rcx)
-; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, (%rcx)
+; SSSE3-NEXT:    pxor %xmm4, %xmm2
 ; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT:    paddd %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT:    paddd %xmm1, %xmm3
 ; SSSE3-NEXT:    movq %xmm3, 16(%rcx)
 ; SSSE3-NEXT:    pxor %xmm4, %xmm3
-; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT:    movq %xmm2, 16(%rdi)
+; SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT:    movq %xmm1, 16(%rdi)
 ; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
 ; SSSE3-NEXT:    retq
 ;
@@ -312,60 +321,60 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
 ; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT:    paddd %xmm0, %xmm3
-; SSE41-NEXT:    pmaxud %xmm3, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT:    paddd %xmm3, %xmm0
+; SSE41-NEXT:    pmaxud %xmm0, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
 ; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT:    pxor %xmm4, %xmm0
-; SSE41-NEXT:    paddd %xmm2, %xmm1
-; SSE41-NEXT:    pmaxud %xmm1, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
-; SSE41-NEXT:    pxor %xmm4, %xmm2
-; SSE41-NEXT:    movq %xmm1, 16(%rcx)
-; SSE41-NEXT:    movdqa %xmm3, (%rcx)
-; SSE41-NEXT:    movq %xmm2, 16(%rdi)
-; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    pxor %xmm4, %xmm3
+; SSE41-NEXT:    paddd %xmm1, %xmm2
+; SSE41-NEXT:    pmaxud %xmm2, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm4, %xmm1
+; SSE41-NEXT:    movq %xmm2, 16(%rcx)
+; SSE41-NEXT:    movdqa %xmm0, (%rcx)
+; SSE41-NEXT:    movq %xmm1, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm3, (%rdi)
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: uaddo_v6i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm3
+; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm5
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm5, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
-; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovq %xmm3, 16(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm5, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: uaddo_v6i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpmaxud %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
-; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, 16(%rdi)
+; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: uaddo_v6i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpcmpltud %ymm0, %ymm1, %k1
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpcmpltud %ymm1, %ymm2, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
-; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, 16(%rdi)
+; AVX512-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
   %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
@@ -379,83 +388,89 @@ define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-LABEL: uaddo_v8i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    paddd %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, (%rdi)
+; SSE2-NEXT:    paddd %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT:    paddd %xmm1, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm3, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: uaddo_v8i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    paddd %xmm0, %xmm2
-; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa %xmm2, (%rdi)
+; SSSE3-NEXT:    paddd %xmm2, %xmm0
 ; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT:    paddd %xmm1, %xmm3
-; SSSE3-NEXT:    pxor %xmm4, %xmm1
-; SSSE3-NEXT:    pxor %xmm3, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm1
-; SSSE3-NEXT:    movdqa %xmm3, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT:    paddd %xmm3, %xmm1
+; SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm1, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: uaddo_v8i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddd %xmm0, %xmm2
-; SSE41-NEXT:    pmaxud %xmm2, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT:    paddd %xmm2, %xmm0
+; SSE41-NEXT:    pmaxud %xmm0, %xmm2
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
 ; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT:    pxor %xmm4, %xmm0
-; SSE41-NEXT:    paddd %xmm1, %xmm3
-; SSE41-NEXT:    pmaxud %xmm3, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm1
-; SSE41-NEXT:    pxor %xmm4, %xmm1
-; SSE41-NEXT:    movdqa %xmm3, 16(%rdi)
-; SSE41-NEXT:    movdqa %xmm2, (%rdi)
+; SSE41-NEXT:    pxor %xmm4, %xmm2
+; SSE41-NEXT:    paddd %xmm3, %xmm1
+; SSE41-NEXT:    pmaxud %xmm1, %xmm3
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE41-NEXT:    pxor %xmm3, %xmm4
+; SSE41-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: uaddo_v8i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm3
+; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm5
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm5, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vmovdqa %xmm2, 16(%rdi)
-; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmovdqa %xmm3, 16(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm5, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: uaddo_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpmaxud %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: uaddo_v8i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpcmpltud %ymm0, %ymm1, %k1
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpcmpltud %ymm1, %ymm2, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX512-NEXT:    vmovdqa %ymm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
   %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
@@ -469,143 +484,155 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
 ; SSE2-LABEL: uaddo_v16i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    paddd %xmm0, %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    movdqa %xmm4, (%rdi)
+; SSE2-NEXT:    paddd %xmm4, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
-; SSE2-NEXT:    paddd %xmm1, %xmm5
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    movdqa %xmm5, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    paddd %xmm5, %xmm1
 ; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm1
-; SSE2-NEXT:    paddd %xmm2, %xmm6
-; SSE2-NEXT:    pxor %xmm8, %xmm2
-; SSE2-NEXT:    movdqa %xmm6, 32(%rdi)
+; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT:    paddd %xmm6, %xmm2
 ; SSE2-NEXT:    pxor %xmm8, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
-; SSE2-NEXT:    paddd %xmm3, %xmm7
-; SSE2-NEXT:    pxor %xmm8, %xmm3
-; SSE2-NEXT:    pxor %xmm7, %xmm8
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm3
-; SSE2-NEXT:    movdqa %xmm7, 48(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    paddd %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    pxor %xmm3, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm7
+; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm7, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: uaddo_v16i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    paddd %xmm0, %xmm4
-; SSSE3-NEXT:    pxor %xmm8, %xmm0
-; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
+; SSSE3-NEXT:    paddd %xmm4, %xmm0
 ; SSSE3-NEXT:    pxor %xmm8, %xmm4
-; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm0
-; SSSE3-NEXT:    paddd %xmm1, %xmm5
-; SSSE3-NEXT:    pxor %xmm8, %xmm1
-; SSSE3-NEXT:    movdqa %xmm5, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    pxor %xmm8, %xmm0
+; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT:    paddd %xmm5, %xmm1
 ; SSSE3-NEXT:    pxor %xmm8, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm1
-; SSSE3-NEXT:    paddd %xmm2, %xmm6
-; SSSE3-NEXT:    pxor %xmm8, %xmm2
-; SSSE3-NEXT:    movdqa %xmm6, 32(%rdi)
+; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSSE3-NEXT:    pxor %xmm8, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT:    paddd %xmm6, %xmm2
 ; SSSE3-NEXT:    pxor %xmm8, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2
-; SSSE3-NEXT:    paddd %xmm3, %xmm7
-; SSSE3-NEXT:    pxor %xmm8, %xmm3
-; SSSE3-NEXT:    pxor %xmm7, %xmm8
-; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm3
-; SSSE3-NEXT:    movdqa %xmm7, 48(%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSSE3-NEXT:    pxor %xmm8, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT:    paddd %xmm7, %xmm3
+; SSSE3-NEXT:    pxor %xmm8, %xmm7
+; SSSE3-NEXT:    pxor %xmm3, %xmm8
+; SSSE3-NEXT:    pcmpgtd %xmm8, %xmm7
+; SSSE3-NEXT:    movdqa %xmm3, 48(%rdi)
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSSE3-NEXT:    movdqa %xmm5, %xmm1
+; SSSE3-NEXT:    movdqa %xmm6, %xmm2
+; SSSE3-NEXT:    movdqa %xmm7, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: uaddo_v16i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddd %xmm0, %xmm4
-; SSE41-NEXT:    pmaxud %xmm4, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT:    paddd %xmm4, %xmm0
+; SSE41-NEXT:    pmaxud %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT:    pcmpeqd %xmm8, %xmm8
-; SSE41-NEXT:    pxor %xmm8, %xmm0
-; SSE41-NEXT:    paddd %xmm1, %xmm5
-; SSE41-NEXT:    pmaxud %xmm5, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm5, %xmm1
-; SSE41-NEXT:    pxor %xmm8, %xmm1
-; SSE41-NEXT:    paddd %xmm2, %xmm6
-; SSE41-NEXT:    pmaxud %xmm6, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm6, %xmm2
-; SSE41-NEXT:    pxor %xmm8, %xmm2
-; SSE41-NEXT:    paddd %xmm3, %xmm7
-; SSE41-NEXT:    pmaxud %xmm7, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm7, %xmm3
-; SSE41-NEXT:    pxor %xmm8, %xmm3
-; SSE41-NEXT:    movdqa %xmm7, 48(%rdi)
-; SSE41-NEXT:    movdqa %xmm6, 32(%rdi)
-; SSE41-NEXT:    movdqa %xmm5, 16(%rdi)
-; SSE41-NEXT:    movdqa %xmm4, (%rdi)
+; SSE41-NEXT:    pxor %xmm8, %xmm4
+; SSE41-NEXT:    paddd %xmm5, %xmm1
+; SSE41-NEXT:    pmaxud %xmm1, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm5
+; SSE41-NEXT:    pxor %xmm8, %xmm5
+; SSE41-NEXT:    paddd %xmm6, %xmm2
+; SSE41-NEXT:    pmaxud %xmm2, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm6
+; SSE41-NEXT:    pxor %xmm8, %xmm6
+; SSE41-NEXT:    paddd %xmm7, %xmm3
+; SSE41-NEXT:    pmaxud %xmm3, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm7
+; SSE41-NEXT:    pxor %xmm7, %xmm8
+; SSE41-NEXT:    movdqa %xmm3, 48(%rdi)
+; SSE41-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSE41-NEXT:    movdqa %xmm1, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    movdqa %xmm5, %xmm1
+; SSE41-NEXT:    movdqa %xmm6, %xmm2
+; SSE41-NEXT:    movdqa %xmm8, %xmm3
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: uaddo_v16i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpmaxud %xmm5, %xmm4, %xmm5
 ; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm5
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vpmaxud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm6
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm6, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm6, %xmm1
 ; AVX1-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpaddd %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpmaxud %xmm6, %xmm5, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm5, %xmm6
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpmaxud %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpaddd %xmm3, %xmm5, %xmm5
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm7
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm7, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm7, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm7
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm3
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
 ; AVX1-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm6
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
 ; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovdqa %xmm4, 48(%rdi)
-; AVX1-NEXT:    vmovdqa %xmm3, 32(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm6, 32(%rdi)
 ; AVX1-NEXT:    vmovdqa %xmm5, 16(%rdi)
-; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
+; AVX1-NEXT:    vmovdqa %xmm7, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: uaddo_v16i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT:    vpmaxud %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT:    vpxor %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpmaxud %ymm3, %ymm4, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
 ; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
-; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpmaxud %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpmaxud %ymm2, %ymm5, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm5, %ymm0
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
-; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
-; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
+; AVX2-NEXT:    vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa %ymm4, 32(%rdi)
+; AVX2-NEXT:    vmovdqa %ymm5, (%rdi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: uaddo_v16i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
-; AVX512-NEXT:    vpcmpltud %zmm0, %zmm1, %k1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpcmpltud %zmm1, %zmm2, %k1
 ; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
+; AVX512-NEXT:    vmovdqa64 %zmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
   %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
@@ -618,19 +645,19 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
 define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; SSE2-LABEL: uaddo_v16i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pmaxub %xmm1, %xmm0
-; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT:    pxor %xmm0, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pslld $31, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pslld $31, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
@@ -639,25 +666,25 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pslld $31, %xmm3
 ; SSE2-NEXT:    psrad $31, %xmm3
-; SSE2-NEXT:    movdqa %xmm1, (%rdi)
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: uaddo_v16i8:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    paddb %xmm0, %xmm1
-; SSSE3-NEXT:    pmaxub %xmm1, %xmm0
-; SSSE3-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSSE3-NEXT:    paddb %xmm1, %xmm0
+; SSSE3-NEXT:    pmaxub %xmm0, %xmm1
+; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm1
 ; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT:    pxor %xmm0, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    pxor %xmm1, %xmm3
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    pslld $31, %xmm4
-; SSSE3-NEXT:    psrad $31, %xmm4
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    pslld $31, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
 ; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm2
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
@@ -666,22 +693,22 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT:    pslld $31, %xmm3
 ; SSSE3-NEXT:    psrad $31, %xmm3
-; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
-; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: uaddo_v16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddb %xmm0, %xmm1
-; SSE41-NEXT:    pmaxub %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    pmaxub %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqb %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT:    pxor %xmm0, %xmm3
-; SSE41-NEXT:    pmovsxbd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; SSE41-NEXT:    pslld $31, %xmm4
-; SSE41-NEXT:    psrad $31, %xmm4
+; SSE41-NEXT:    pxor %xmm1, %xmm3
+; SSE41-NEXT:    pmovsxbd %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; SSE41-NEXT:    pslld $31, %xmm2
@@ -690,14 +717,14 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; SSE41-NEXT:    pslld $31, %xmm3
 ; SSE41-NEXT:    psrad $31, %xmm3
-; SSE41-NEXT:    movdqa %xmm1, (%rdi)
-; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: uaddo_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vpmaxub %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpmaxub %xmm1, %xmm2, %xmm0
 ; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
@@ -716,7 +743,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; AVX2-LABEL: uaddo_v16i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm2
-; AVX2-NEXT:    vpmaxub %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpmaxub %xmm1, %xmm2, %xmm0
 ; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
@@ -728,10 +755,10 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: uaddo_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpltub %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpcmpltub %xmm1, %xmm2, %k1
 ; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
   %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
@@ -744,84 +771,82 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
 ; SSE2-LABEL: uaddo_v8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NEXT:    pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    paddw %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtw %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pslld $31, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pslld $31, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    movdqa %xmm1, (%rdi)
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: uaddo_v8i16:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT:    paddw %xmm0, %xmm1
-; SSSE3-NEXT:    pxor %xmm3, %xmm2
-; SSSE3-NEXT:    pxor %xmm1, %xmm3
-; SSSE3-NEXT:    pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-NEXT:    pxor %xmm2, %xmm1
+; SSSE3-NEXT:    pxor %xmm0, %xmm2
+; SSSE3-NEXT:    pcmpgtw %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    pslld $31, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    pslld $31, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
-; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
-; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: uaddo_v8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddw %xmm0, %xmm1
-; SSE41-NEXT:    pmaxuw %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT:    paddw %xmm1, %xmm0
+; SSE41-NEXT:    pmaxuw %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqw %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    pxor %xmm0, %xmm2
-; SSE41-NEXT:    pmovsxwd %xmm2, %xmm0
-; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
-; SSE41-NEXT:    pslld $31, %xmm2
-; SSE41-NEXT:    psrad $31, %xmm2
-; SSE41-NEXT:    movdqa %xmm1, (%rdi)
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm1
+; SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pslld $31, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: uaddo_v8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpmaxuw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpmaxuw %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vpcmpeqw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: uaddo_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpmaxuw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpmaxuw %xmm1, %xmm2, %xmm0
+; AVX2-NEXT:    vpcmpeqw %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: uaddo_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpltuw %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpcmpltuw %xmm1, %xmm2, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
   %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
@@ -835,25 +860,26 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; SSE-LABEL: uaddo_v2i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT:    paddq %xmm0, %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm2
-; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    paddq %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    movdqa %xmm1, %xmm3
 ; SSE-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSE-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,3,3]
 ; SSE-NEXT:    pand %xmm2, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,3,3]
-; SSE-NEXT:    por %xmm3, %xmm0
-; SSE-NEXT:    movdqa %xmm1, (%rdi)
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,3,3]
+; SSE-NEXT:    por %xmm3, %xmm1
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: uaddo_v2i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX1-NEXT:    # xmm2 = mem[0,0]
-; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm0
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
@@ -864,7 +890,7 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; AVX2-LABEL: uaddo_v2i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm0
 ; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
@@ -874,11 +900,11 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: uaddo_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpltuq %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpcmpltuq %xmm1, %xmm2, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT:    vmovdqa %xmm2, (%rdi)
 ; AVX512-NEXT:    retq
   %t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
   %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0

>From dae0ff7a2f78ba086ac13777be0a081d6a3b59a9 Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Thu, 11 Dec 2025 20:42:24 +0530
Subject: [PATCH 8/8] resolve merge conflict

---
 llvm/test/CodeGen/RISCV/rvv/vp-splat.ll | 744 ------------------------
 1 file changed, 744 deletions(-)
 delete mode 100644 llvm/test/CodeGen/RISCV/rvv/vp-splat.ll

diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
deleted file mode 100644
index aeee1fa8215f0..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
+++ /dev/null
@@ -1,744 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,NOZFMIN,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,NOZFMIN,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,NOZFMIN,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,NOZFMIN,ZVFHMIN
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfhmin,+zfbfmin,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,ZFMIN
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfhmin,+zfbfmin,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,ZFMIN
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfhmin,+zvfhmin,+experimental-zvfbfa -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,ZVFBFA
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfhmin,+zvfhmin,+experimental-zvfbfa -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,ZVFBFA
-
-define <vscale x 1 x i8> @vp_splat_nxv1i8(i8 %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv1i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 1 x i8> @llvm.experimental.vp.splat.nxv1i8(i8 %val, <vscale x 1 x i1> %m, i32 %evl)
-  ret <vscale x 1 x i8> %splat
-}
-
-define <vscale x 2 x i8> @vp_splat_nxv2i8(i8 %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv2i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 2 x i8> @llvm.experimental.vp.splat.nxv2i8(i8 %val, <vscale x 2 x i1> %m, i32 %evl)
-  ret <vscale x 2 x i8> %splat
-}
-
-define <vscale x 4 x i8> @vp_splat_nxv4i8(i8 %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv4i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 4 x i8> @llvm.experimental.vp.splat.nxv4i8(i8 %val, <vscale x 4 x i1> %m, i32 %evl)
-  ret <vscale x 4 x i8> %splat
-}
-
-define <vscale x 8 x i8> @vp_splat_nxv8i8(i8 %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv8i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 8 x i8> @llvm.experimental.vp.splat.nxv8i8(i8 %val, <vscale x 8 x i1> %m, i32 %evl)
-  ret <vscale x 8 x i8> %splat
-}
-
-define <vscale x 16 x i8> @vp_splat_nxv16i8(i8 %val, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv16i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 16 x i8> @llvm.experimental.vp.splat.nxv16i8(i8 %val, <vscale x 16 x i1> %m, i32 %evl)
-  ret <vscale x 16 x i8> %splat
-}
-
-define <vscale x 32 x i8> @vp_splat_nxv32i8(i8 %val, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv32i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 32 x i8> @llvm.experimental.vp.splat.nxv32i8(i8 %val, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x i8> %splat
-}
-
-define <vscale x 64 x i8> @vp_splat_nxv64i8(i8 %val, <vscale x 64 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv64i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 64 x i8> @llvm.experimental.vp.splat.nxv64i8(i8 %val, <vscale x 64 x i1> %m, i32 %evl)
-  ret <vscale x 64 x i8> %splat
-}
-
-define <vscale x 1 x i16> @vp_splat_nxv1i16(i16 %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv1i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 1 x i16> @llvm.experimental.vp.splat.nxv1i16(i16 %val, <vscale x 1 x i1> %m, i32 %evl)
-  ret <vscale x 1 x i16> %splat
-}
-
-define <vscale x 2 x i16> @vp_splat_nxv2i16(i16 %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv2i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 2 x i16> @llvm.experimental.vp.splat.nxv2i16(i16 %val, <vscale x 2 x i1> %m, i32 %evl)
-  ret <vscale x 2 x i16> %splat
-}
-
-define <vscale x 4 x i16> @vp_splat_nxv4i16(i16 %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv4i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 4 x i16> @llvm.experimental.vp.splat.nxv4i16(i16 %val, <vscale x 4 x i1> %m, i32 %evl)
-  ret <vscale x 4 x i16> %splat
-}
-
-define <vscale x 8 x i16> @vp_splat_nxv8i16(i16 %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv8i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 8 x i16> @llvm.experimental.vp.splat.nxv8i16(i16 %val, <vscale x 8 x i1> %m, i32 %evl)
-  ret <vscale x 8 x i16> %splat
-}
-
-define <vscale x 16 x i16> @vp_splat_nxv16i16(i16 %val, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv16i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 16 x i16> @llvm.experimental.vp.splat.nxv16i16(i16 %val, <vscale x 16 x i1> %m, i32 %evl)
-  ret <vscale x 16 x i16> %splat
-}
-
-define <vscale x 32 x i16> @vp_splat_nxv32i16(i16 %val, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv32i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 32 x i16> @llvm.experimental.vp.splat.nxv32i16(i16 %val, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x i16> %splat
-}
-
-define <vscale x 1 x i32> @vp_splat_nxv1i32(i32 %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv1i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 1 x i32> @llvm.experimental.vp.splat.nxv1i32(i32 %val, <vscale x 1 x i1> %m, i32 %evl)
-  ret <vscale x 1 x i32> %splat
-}
-
-define <vscale x 2 x i32> @vp_splat_nxv2i32(i32 %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv2i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 2 x i32> @llvm.experimental.vp.splat.nxv2i32(i32 %val, <vscale x 2 x i1> %m, i32 %evl)
-  ret <vscale x 2 x i32> %splat
-}
-
-define <vscale x 4 x i32> @vp_splat_nxv4i32(i32 %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 4 x i32> @llvm.experimental.vp.splat.nxv4i32(i32 %val, <vscale x 4 x i1> %m, i32 %evl)
-  ret <vscale x 4 x i32> %splat
-}
-
-define <vscale x 8 x i32> @vp_splat_nxv8i32(i32 %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 8 x i32> @llvm.experimental.vp.splat.nxv8i32(i32 %val, <vscale x 8 x i1> %m, i32 %evl)
-  ret <vscale x 8 x i32> %splat
-}
-
-define <vscale x 16 x i32> @vp_splat_nxv16i32(i32 %val, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv16i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 16 x i32> @llvm.experimental.vp.splat.nxv16i32(i32 %val, <vscale x 16 x i1> %m, i32 %evl)
-  ret <vscale x 16 x i32> %splat
-}
-
-define <vscale x 1 x i64> @vp_splat_nxv1i64(i64 %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_splat_nxv1i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a0), zero
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_splat_nxv1i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a0
-; RV64-NEXT:    ret
-  %splat = call <vscale x 1 x i64> @llvm.experimental.vp.splat.nxv1i64(i64 %val, <vscale x 1 x i1> %m, i32 %evl)
-  ret <vscale x 1 x i64> %splat
-}
-
-define <vscale x 2 x i64> @vp_splat_nxv2i64(i64 %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_splat_nxv2i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a0), zero
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_splat_nxv2i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a0
-; RV64-NEXT:    ret
-  %splat = call <vscale x 2 x i64> @llvm.experimental.vp.splat.nxv2i64(i64 %val, <vscale x 2 x i1> %m, i32 %evl)
-  ret <vscale x 2 x i64> %splat
-}
-
-define <vscale x 4 x i64> @vp_splat_nxv4i64(i64 %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_splat_nxv4i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a0), zero
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_splat_nxv4i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a0
-; RV64-NEXT:    ret
-  %splat = call <vscale x 4 x i64> @llvm.experimental.vp.splat.nxv4i64(i64 %val, <vscale x 4 x i1> %m, i32 %evl)
-  ret <vscale x 4 x i64> %splat
-}
-
-define <vscale x 8 x i64> @vp_splat_nxv8i64(i64 %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_splat_nxv8i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a0), zero
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vp_splat_nxv8i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a0
-; RV64-NEXT:    ret
-  %splat = call <vscale x 8 x i64> @llvm.experimental.vp.splat.nxv8i64(i64 %val, <vscale x 8 x i1> %m, i32 %evl)
-  ret <vscale x 8 x i64> %splat
-}
-
-define <vscale x 1 x bfloat> @vp_splat_nxv1bf16(bfloat %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; NOZFMIN-LABEL: vp_splat_nxv1bf16:
-; NOZFMIN:       # %bb.0:
-; NOZFMIN-NEXT:    fmv.x.w a1, fa0
-; NOZFMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; NOZFMIN-NEXT:    vmv.v.x v8, a1
-; NOZFMIN-NEXT:    ret
-;
-; ZFMIN-LABEL: vp_splat_nxv1bf16:
-; ZFMIN:       # %bb.0:
-; ZFMIN-NEXT:    fmv.x.h a1, fa0
-; ZFMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZFMIN-NEXT:    vmv.v.x v8, a1
-; ZFMIN-NEXT:    ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv1bf16:
-; ZVFBFA:       # %bb.0:
-; ZVFBFA-NEXT:    vsetvli zero, a0, e16alt, mf4, ta, ma
-; ZVFBFA-NEXT:    vfmv.v.f v8, fa0
-; ZVFBFA-NEXT:    ret
-  %splat = call <vscale x 1 x bfloat> @llvm.experimental.vp.splat.nxv1bf16(bfloat %val, <vscale x 1 x i1> %m, i32 %evl)
-  ret <vscale x 1 x bfloat> %splat
-}
-
-define <vscale x 2 x bfloat> @vp_splat_nxv2bf16(bfloat %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; NOZFMIN-LABEL: vp_splat_nxv2bf16:
-; NOZFMIN:       # %bb.0:
-; NOZFMIN-NEXT:    fmv.x.w a1, fa0
-; NOZFMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; NOZFMIN-NEXT:    vmv.v.x v8, a1
-; NOZFMIN-NEXT:    ret
-;
-; ZFMIN-LABEL: vp_splat_nxv2bf16:
-; ZFMIN:       # %bb.0:
-; ZFMIN-NEXT:    fmv.x.h a1, fa0
-; ZFMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZFMIN-NEXT:    vmv.v.x v8, a1
-; ZFMIN-NEXT:    ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv2bf16:
-; ZVFBFA:       # %bb.0:
-; ZVFBFA-NEXT:    vsetvli zero, a0, e16alt, mf2, ta, ma
-; ZVFBFA-NEXT:    vfmv.v.f v8, fa0
-; ZVFBFA-NEXT:    ret
-  %splat = call <vscale x 2 x bfloat> @llvm.experimental.vp.splat.nxv2bf16(bfloat %val, <vscale x 2 x i1> %m, i32 %evl)
-  ret <vscale x 2 x bfloat> %splat
-}
-
-define <vscale x 4 x bfloat> @vp_splat_nxv4bf16(bfloat %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; NOZFMIN-LABEL: vp_splat_nxv4bf16:
-; NOZFMIN:       # %bb.0:
-; NOZFMIN-NEXT:    fmv.x.w a1, fa0
-; NOZFMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; NOZFMIN-NEXT:    vmv.v.x v8, a1
-; NOZFMIN-NEXT:    ret
-;
-; ZFMIN-LABEL: vp_splat_nxv4bf16:
-; ZFMIN:       # %bb.0:
-; ZFMIN-NEXT:    fmv.x.h a1, fa0
-; ZFMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZFMIN-NEXT:    vmv.v.x v8, a1
-; ZFMIN-NEXT:    ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv4bf16:
-; ZVFBFA:       # %bb.0:
-; ZVFBFA-NEXT:    vsetvli zero, a0, e16alt, m1, ta, ma
-; ZVFBFA-NEXT:    vfmv.v.f v8, fa0
-; ZVFBFA-NEXT:    ret
-  %splat = call <vscale x 4 x bfloat> @llvm.experimental.vp.splat.nxv4bf16(bfloat %val, <vscale x 4 x i1> %m, i32 %evl)
-  ret <vscale x 4 x bfloat> %splat
-}
-
-define <vscale x 8 x bfloat> @vp_splat_nxv8bf16(bfloat %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; NOZFMIN-LABEL: vp_splat_nxv8bf16:
-; NOZFMIN:       # %bb.0:
-; NOZFMIN-NEXT:    fmv.x.w a1, fa0
-; NOZFMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; NOZFMIN-NEXT:    vmv.v.x v8, a1
-; NOZFMIN-NEXT:    ret
-;
-; ZFMIN-LABEL: vp_splat_nxv8bf16:
-; ZFMIN:       # %bb.0:
-; ZFMIN-NEXT:    fmv.x.h a1, fa0
-; ZFMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZFMIN-NEXT:    vmv.v.x v8, a1
-; ZFMIN-NEXT:    ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv8bf16:
-; ZVFBFA:       # %bb.0:
-; ZVFBFA-NEXT:    vsetvli zero, a0, e16alt, m2, ta, ma
-; ZVFBFA-NEXT:    vfmv.v.f v8, fa0
-; ZVFBFA-NEXT:    ret
-  %splat = call <vscale x 8 x bfloat> @llvm.experimental.vp.splat.nxv8bf16(bfloat %val, <vscale x 8 x i1> %m, i32 %evl)
-  ret <vscale x 8 x bfloat> %splat
-}
-
-define <vscale x 16 x bfloat> @vp_splat_nxv16bf16(bfloat %val, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; NOZFMIN-LABEL: vp_splat_nxv16bf16:
-; NOZFMIN:       # %bb.0:
-; NOZFMIN-NEXT:    fmv.x.w a1, fa0
-; NOZFMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; NOZFMIN-NEXT:    vmv.v.x v8, a1
-; NOZFMIN-NEXT:    ret
-;
-; ZFMIN-LABEL: vp_splat_nxv16bf16:
-; ZFMIN:       # %bb.0:
-; ZFMIN-NEXT:    fmv.x.h a1, fa0
-; ZFMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZFMIN-NEXT:    vmv.v.x v8, a1
-; ZFMIN-NEXT:    ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv16bf16:
-; ZVFBFA:       # %bb.0:
-; ZVFBFA-NEXT:    vsetvli zero, a0, e16alt, m4, ta, ma
-; ZVFBFA-NEXT:    vfmv.v.f v8, fa0
-; ZVFBFA-NEXT:    ret
-  %splat = call <vscale x 16 x bfloat> @llvm.experimental.vp.splat.nxv16bf16(bfloat %val, <vscale x 16 x i1> %m, i32 %evl)
-  ret <vscale x 16 x bfloat> %splat
-}
-
-define <vscale x 32 x bfloat> @vp_splat_nxv32bf16(bfloat %val, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; NOZFMIN-LABEL: vp_splat_nxv32bf16:
-; NOZFMIN:       # %bb.0:
-; NOZFMIN-NEXT:    fmv.x.w a1, fa0
-; NOZFMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; NOZFMIN-NEXT:    vmv.v.x v8, a1
-; NOZFMIN-NEXT:    ret
-;
-; ZFMIN-LABEL: vp_splat_nxv32bf16:
-; ZFMIN:       # %bb.0:
-; ZFMIN-NEXT:    fmv.x.h a1, fa0
-; ZFMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZFMIN-NEXT:    vmv.v.x v8, a1
-; ZFMIN-NEXT:    ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv32bf16:
-; ZVFBFA:       # %bb.0:
-; ZVFBFA-NEXT:    vsetvli zero, a0, e16alt, m8, ta, ma
-; ZVFBFA-NEXT:    vfmv.v.f v8, fa0
-; ZVFBFA-NEXT:    ret
-  %splat = call <vscale x 32 x bfloat> @llvm.experimental.vp.splat.nxv32bf16(bfloat %val, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x bfloat> %splat
-}
-
-define <vscale x 1 x half> @vp_splat_nxv1f16(half %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vp_splat_nxv1f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFH-NEXT:    vfmv.v.f v8, fa0
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vp_splat_nxv1f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    ret
-;
-; ZFMIN-LABEL: vp_splat_nxv1f16:
-; ZFMIN:       # %bb.0:
-; ZFMIN-NEXT:    fmv.x.h a1, fa0
-; ZFMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZFMIN-NEXT:    vmv.v.x v8, a1
-; ZFMIN-NEXT:    ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv1f16:
-; ZVFBFA:       # %bb.0:
-; ZVFBFA-NEXT:    fmv.x.h a1, fa0
-; ZVFBFA-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFBFA-NEXT:    vmv.v.x v8, a1
-; ZVFBFA-NEXT:    ret
-  %splat = call <vscale x 1 x half> @llvm.experimental.vp.splat.nxv1f16(half %val, <vscale x 1 x i1> %m, i32 %evl)
-  ret <vscale x 1 x half> %splat
-}
-
-define <vscale x 2 x half> @vp_splat_nxv2f16(half %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vp_splat_nxv2f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFH-NEXT:    vfmv.v.f v8, fa0
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vp_splat_nxv2f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    ret
-;
-; ZFMIN-LABEL: vp_splat_nxv2f16:
-; ZFMIN:       # %bb.0:
-; ZFMIN-NEXT:    fmv.x.h a1, fa0
-; ZFMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZFMIN-NEXT:    vmv.v.x v8, a1
-; ZFMIN-NEXT:    ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv2f16:
-; ZVFBFA:       # %bb.0:
-; ZVFBFA-NEXT:    fmv.x.h a1, fa0
-; ZVFBFA-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFBFA-NEXT:    vmv.v.x v8, a1
-; ZVFBFA-NEXT:    ret
-  %splat = call <vscale x 2 x half> @llvm.experimental.vp.splat.nxv2f16(half %val, <vscale x 2 x i1> %m, i32 %evl)
-  ret <vscale x 2 x half> %splat
-}
-
-define <vscale x 4 x half> @vp_splat_nxv4f16(half %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vp_splat_nxv4f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFH-NEXT:    vfmv.v.f v8, fa0
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vp_splat_nxv4f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    ret
-;
-; ZFMIN-LABEL: vp_splat_nxv4f16:
-; ZFMIN:       # %bb.0:
-; ZFMIN-NEXT:    fmv.x.h a1, fa0
-; ZFMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZFMIN-NEXT:    vmv.v.x v8, a1
-; ZFMIN-NEXT:    ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv4f16:
-; ZVFBFA:       # %bb.0:
-; ZVFBFA-NEXT:    fmv.x.h a1, fa0
-; ZVFBFA-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFBFA-NEXT:    vmv.v.x v8, a1
-; ZVFBFA-NEXT:    ret
-  %splat = call <vscale x 4 x half> @llvm.experimental.vp.splat.nxv4f16(half %val, <vscale x 4 x i1> %m, i32 %evl)
-  ret <vscale x 4 x half> %splat
-}
-
-define <vscale x 8 x half> @vp_splat_nxv8f16(half %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vp_splat_nxv8f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFH-NEXT:    vfmv.v.f v8, fa0
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vp_splat_nxv8f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    ret
-;
-; ZFMIN-LABEL: vp_splat_nxv8f16:
-; ZFMIN:       # %bb.0:
-; ZFMIN-NEXT:    fmv.x.h a1, fa0
-; ZFMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZFMIN-NEXT:    vmv.v.x v8, a1
-; ZFMIN-NEXT:    ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv8f16:
-; ZVFBFA:       # %bb.0:
-; ZVFBFA-NEXT:    fmv.x.h a1, fa0
-; ZVFBFA-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFBFA-NEXT:    vmv.v.x v8, a1
-; ZVFBFA-NEXT:    ret
-  %splat = call <vscale x 8 x half> @llvm.experimental.vp.splat.nxv8f16(half %val, <vscale x 8 x i1> %m, i32 %evl)
-  ret <vscale x 8 x half> %splat
-}
-
-define <vscale x 16 x half> @vp_splat_nxv16f16(half %val, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vp_splat_nxv16f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vfmv.v.f v8, fa0
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vp_splat_nxv16f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    ret
-;
-; ZFMIN-LABEL: vp_splat_nxv16f16:
-; ZFMIN:       # %bb.0:
-; ZFMIN-NEXT:    fmv.x.h a1, fa0
-; ZFMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZFMIN-NEXT:    vmv.v.x v8, a1
-; ZFMIN-NEXT:    ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv16f16:
-; ZVFBFA:       # %bb.0:
-; ZVFBFA-NEXT:    fmv.x.h a1, fa0
-; ZVFBFA-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFBFA-NEXT:    vmv.v.x v8, a1
-; ZVFBFA-NEXT:    ret
-  %splat = call <vscale x 16 x half> @llvm.experimental.vp.splat.nxv16f16(half %val, <vscale x 16 x i1> %m, i32 %evl)
-  ret <vscale x 16 x half> %splat
-}
-
-define <vscale x 32 x half> @vp_splat_nxv32f16(half %val, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vp_splat_nxv32f16:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vfmv.v.f v8, fa0
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vp_splat_nxv32f16:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    ret
-;
-; ZFMIN-LABEL: vp_splat_nxv32f16:
-; ZFMIN:       # %bb.0:
-; ZFMIN-NEXT:    fmv.x.h a1, fa0
-; ZFMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZFMIN-NEXT:    vmv.v.x v8, a1
-; ZFMIN-NEXT:    ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv32f16:
-; ZVFBFA:       # %bb.0:
-; ZVFBFA-NEXT:    fmv.x.h a1, fa0
-; ZVFBFA-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFBFA-NEXT:    vmv.v.x v8, a1
-; ZVFBFA-NEXT:    ret
-  %splat = call <vscale x 32 x half> @llvm.experimental.vp.splat.nxv32f16(half %val, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x half> %splat
-}
-
-define <vscale x 1 x float> @vp_splat_nxv1f32(float %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv1f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vfmv.v.f v8, fa0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 1 x float> @llvm.experimental.vp.splat.nxv1f32(float %val, <vscale x 1 x i1> %m, i32 %evl)
-  ret <vscale x 1 x float> %splat
-}
-
-define <vscale x 2 x float> @vp_splat_nxv2f32(float %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv2f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfmv.v.f v8, fa0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 2 x float> @llvm.experimental.vp.splat.nxv2f32(float %val, <vscale x 2 x i1> %m, i32 %evl)
-  ret <vscale x 2 x float> %splat
-}
-
-define <vscale x 4 x float> @vp_splat_nxv4f32(float %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv4f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfmv.v.f v8, fa0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 4 x float> @llvm.experimental.vp.splat.nxv4f32(float %val, <vscale x 4 x i1> %m, i32 %evl)
-  ret <vscale x 4 x float> %splat
-}
-
-define <vscale x 8 x float> @vp_splat_nxv8f32(float %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv8f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfmv.v.f v8, fa0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 8 x float> @llvm.experimental.vp.splat.nxv8f32(float %val, <vscale x 8 x i1> %m, i32 %evl)
-  ret <vscale x 8 x float> %splat
-}
-
-define <vscale x 16 x float> @vp_splat_nxv16f32(float %val, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv16f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vfmv.v.f v8, fa0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 16 x float> @llvm.experimental.vp.splat.nxv16f32(float %val, <vscale x 16 x i1> %m, i32 %evl)
-  ret <vscale x 16 x float> %splat
-}
-
-define <vscale x 1 x double> @vp_splat_nxv1f64(double %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv1f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT:    vfmv.v.f v8, fa0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 1 x double> @llvm.experimental.vp.splat.nxv1f64(double %val, <vscale x 1 x i1> %m, i32 %evl)
-  ret <vscale x 1 x double> %splat
-}
-
-define <vscale x 2 x double> @vp_splat_nxv2f64(double %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv2f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT:    vfmv.v.f v8, fa0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 2 x double> @llvm.experimental.vp.splat.nxv2f64(double %val, <vscale x 2 x i1> %m, i32 %evl)
-  ret <vscale x 2 x double> %splat
-}
-
-define <vscale x 4 x double> @vp_splat_nxv4f64(double %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv4f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT:    vfmv.v.f v8, fa0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 4 x double> @llvm.experimental.vp.splat.nxv4f64(double %val, <vscale x 4 x i1> %m, i32 %evl)
-  ret <vscale x 4 x double> %splat
-}
-
-define <vscale x 8 x double> @vp_splat_nxv8f64(double %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv8f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmv.v.f v8, fa0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 8 x double> @llvm.experimental.vp.splat.nxv8f64(double %val, <vscale x 8 x i1> %m, i32 %evl)
-  ret <vscale x 8 x double> %splat
-}
-
-define <vscale x 16 x i31> @vp_splat_nxv16i31(i31 %val, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv16i31:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 16 x i31> @llvm.experimental.vp.splat.nxv16i31(i31 %val, <vscale x 16 x i1> %m, i32 %evl)
-  ret <vscale x 16 x i31> %splat
-}
-
-define <vscale x 15 x i32> @vp_splat_nxv15i32(i32 %val, <vscale x 15 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv15i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 15 x i32> @llvm.experimental.vp.splat.nxv15i32(i32 %val, <vscale x 15 x i1> %m, i32 %evl)
-  ret <vscale x 15 x i32> %splat
-}
-
-; Split case.
-define <vscale x 32 x i32> @vp_splat_nxv32i32(i32 %val, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv32i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
-; CHECK-NEXT:    bltu a1, a2, .LBB45_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a1, a2
-; CHECK-NEXT:  .LBB45_2:
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    ret
-  %splat = call <vscale x 32 x i32> @llvm.experimental.vp.splat.nxv32i32(i32 %val, <vscale x 32 x i1> %m, i32 %evl)
-  ret <vscale x 32 x i32> %splat
-}


