[llvm] [InstCombine] Lower usub.with.overflow to explicit subtraction + unsigned comparison (PR #170896)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Dec 6 09:46:37 PST 2025
https://github.com/aabhinavg1 updated https://github.com/llvm/llvm-project/pull/170896
>From b268573d6a03fe14e22a7703dcd6a284e5d0ca9a Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Fri, 5 Dec 2025 23:43:14 +0530
Subject: [PATCH 1/4] [InstCombine] Lower usub.with.overflow to explicit
 subtraction + unsigned comparison
---
.../InstCombine/InstCombineCalls.cpp | 13 ++++
.../test/Transforms/InstCombine/known-bits.ll | 15 +++--
llvm/test/Transforms/InstCombine/pr170634.ll | 33 ++++++++++
...ult-of-usub-is-non-zero-and-no-overflow.ll | 60 +++++++++----------
.../usub-overflow-known-by-implied-cond.ll | 40 +++++--------
llvm/test/Transforms/InstCombine/usubo.ll | 10 ++--
.../Transforms/InstCombine/with_overflow.ll | 7 ++-
7 files changed, 108 insertions(+), 70 deletions(-)
create mode 100644 llvm/test/Transforms/InstCombine/pr170634.ll
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 743c4f574e131..af85985843914 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -864,6 +864,19 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
WO->getRHS(), *WO, OperationResult, OverflowResult))
return createOverflowTuple(WO, OperationResult, OverflowResult);
+
+ // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y}
+ if (WO->getBinaryOp() == Instruction::Sub && !WO->isSigned()) {
+ IRBuilder<> Builder(WO);
+ Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
+ Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
+
+ Value *ResultStruct = UndefValue::get(WO->getType());
+ ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
+ ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
+
+ return replaceInstUsesWith(*WO, ResultStruct);
+ }
// See whether we can optimize the overflow check with assumption information.
for (User *U : WO->users()) {
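The hunk above expands the unsigned-subtraction intrinsic into plain IR, as the
comment states: usub.with.overflow(X, Y) becomes the pair {X - Y, X u< Y}. A
minimal before/after sketch in LLVM IR (value names are illustrative; the
concrete shape matches the regenerated CHECK lines further down):

  ; before
  %agg = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 %x, i8 %y)

  ; after: explicit subtraction, unsigned compare, and tuple reconstruction
  %sub = sub i8 %x, %y
  %ov  = icmp ult i8 %x, %y
  %t0  = insertvalue { i8, i1 } undef, i8 %sub, 0
  %agg = insertvalue { i8, i1 } %t0, i1 %ov, 1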
diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index da2123a5dfe74..fc73ce5503ffe 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -1068,12 +1068,12 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
; CHECK-LABEL: @extract_value_usub(
; CHECK-NEXT: [[Z:%.*]] = add nuw i8 [[ZZ:%.*]], 1
; CHECK-NEXT: [[Y:%.*]] = add i8 [[X:%.*]], [[Z]]
-; CHECK-NEXT: [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
-; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
-; CHECK-NEXT: [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
+; CHECK-NEXT: [[SUB:%.*]] = xor i8 [[ZZ]], -1
+; CHECK-NEXT: [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
; CHECK-NEXT: call void @use.i1(i1 [[UOV]])
; CHECK-NEXT: call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT: ret i1 false
+; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[ZZ]], -1
+; CHECK-NEXT: ret i1 [[R]]
;
%z = add nuw i8 %zz, 1
%y = add i8 %x, %z
@@ -1090,12 +1090,11 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
define i1 @extract_value_usub_fail(i8 %x, i8 %z) {
; CHECK-LABEL: @extract_value_usub_fail(
; CHECK-NEXT: [[Y:%.*]] = add i8 [[X:%.*]], [[Z:%.*]]
-; CHECK-NEXT: [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
-; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
-; CHECK-NEXT: [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
+; CHECK-NEXT: [[SUB:%.*]] = sub i8 0, [[Z]]
+; CHECK-NEXT: [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
; CHECK-NEXT: call void @use.i1(i1 [[UOV]])
; CHECK-NEXT: call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[SUB]], 0
+; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[Z]], 0
; CHECK-NEXT: ret i1 [[R]]
;
%y = add i8 %x, %z
diff --git a/llvm/test/Transforms/InstCombine/pr170634.ll b/llvm/test/Transforms/InstCombine/pr170634.ll
new file mode 100644
index 0000000000000..62a332e14b04a
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/pr170634.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+define dso_local i64 @func(i64 noundef %x, i64 noundef %y) local_unnamed_addr {
+; CHECK-LABEL: @func(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: br label [[RETURN:%.*]]
+; CHECK: if.end:
+; CHECK-NEXT: [[TMP1:%.*]] = sub nuw i64 [[X]], [[Y]]
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: return:
+; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i64 [ 291, [[IF_THEN]] ], [ [[TMP1]], [[IF_END]] ]
+; CHECK-NEXT: ret i64 [[RETVAL_0]]
+;
+entry:
+ %0 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %x, i64 %y)
+ %1 = extractvalue { i64, i1 } %0, 1
+ %2 = extractvalue { i64, i1 } %0, 0
+ br i1 %1, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ br label %return
+
+if.end: ; preds = %entry
+ br label %return
+
+return: ; preds = %if.end, %if.then
+ %retval.0 = phi i64 [ 291, %if.then ], [ %2, %if.end ]
+ ret i64 %retval.0
+}
+
diff --git a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
index 30a5072c7edc8..46b8a853e6cf5 100644
--- a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
@@ -141,16 +141,16 @@ define i1 @t1_strict_logical(i8 %base, i8 %offset) {
define i1 @t2(i8 %base, i8 %offset) {
; CHECK-LABEL: @t2(
-; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -168,16 +168,16 @@ define i1 @t2(i8 %base, i8 %offset) {
define i1 @t2_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t2_logical(
-; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -321,16 +321,16 @@ define i1 @t5_commutability2_logical(i8 %base, i8 %offset) {
define i1 @t6_commutability(i8 %base, i8 %offset) {
; CHECK-LABEL: @t6_commutability(
-; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -348,16 +348,16 @@ define i1 @t6_commutability(i8 %base, i8 %offset) {
define i1 @t6_commutability_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t6_commutability_logical(
-; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -459,14 +459,14 @@ define i1 @t7_nonstrict_logical(i8 %base, i8 %offset) {
define i1 @t8(i8 %base, i8 %offset) {
; CHECK-LABEL: @t8(
-; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT: [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
+; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -482,14 +482,14 @@ define i1 @t8(i8 %base, i8 %offset) {
define i1 @t8_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t8_logical(
-; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT: [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
+; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
diff --git a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
index 90ca39a70a0bb..c9030e5ab0321 100644
--- a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
+++ b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
@@ -175,11 +175,10 @@ define i32 @test7(i32 %a, i32 %b) {
; CHECK-NEXT: [[COND:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: br i1 [[COND]], label [[BB1:%.*]], label [[BB3:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -205,11 +204,10 @@ define i32 @test8(i32 %a, i32 %b) {
; CHECK-NEXT: [[COND_NOT:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: br i1 [[COND_NOT]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -296,11 +294,10 @@ define i32 @test10(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[AND:%.*]] = and i1 [[COND]], [[COND2:%.*]]
; CHECK-NEXT: br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -328,11 +325,10 @@ define i32 @test10_logical(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[AND:%.*]] = select i1 [[COND]], i1 [[COND2:%.*]], i1 false
; CHECK-NEXT: br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -360,11 +356,10 @@ define i32 @test11(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -392,11 +387,10 @@ define i32 @test11_logical(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -424,11 +418,10 @@ define i32 @test12(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -456,11 +449,10 @@ define i32 @test12_logical(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/InstCombine/usubo.ll b/llvm/test/Transforms/InstCombine/usubo.ll
index 2074190a2cd45..e4b9c0e08ba22 100644
--- a/llvm/test/Transforms/InstCombine/usubo.ll
+++ b/llvm/test/Transforms/InstCombine/usubo.ll
@@ -130,10 +130,9 @@ define i1 @sub_ne0(i8 %x, i8 %y, i1 %b) {
define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
; CHECK-LABEL: @sub_eq1(
-; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
-; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
+; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
; CHECK-NEXT: call void @use(i1 [[OV]])
-; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
; CHECK-NEXT: [[EQ1:%.*]] = icmp eq i8 [[SUB]], 1
; CHECK-NEXT: ret i1 [[EQ1]]
;
@@ -149,10 +148,9 @@ define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
define i1 @sub_sgt0(i8 %x, i8 %y, i1 %b) {
; CHECK-LABEL: @sub_sgt0(
-; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
-; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
+; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
; CHECK-NEXT: call void @use(i1 [[OV]])
-; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
; CHECK-NEXT: [[SGT0:%.*]] = icmp sgt i8 [[SUB]], 0
; CHECK-NEXT: ret i1 [[SGT0]]
;
diff --git a/llvm/test/Transforms/InstCombine/with_overflow.ll b/llvm/test/Transforms/InstCombine/with_overflow.ll
index fa810408730e1..4f7a15cc89d6c 100644
--- a/llvm/test/Transforms/InstCombine/with_overflow.ll
+++ b/llvm/test/Transforms/InstCombine/with_overflow.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+; RUN: opt -passes='instcombine<no-verify-fixpoint>' -S < %s | FileCheck %s
declare { i8, i1 } @llvm.uadd.with.overflow.i8(i8, i8) nounwind readnone
declare { i8, i1 } @llvm.sadd.with.overflow.i8(i8, i8) nounwind readnone
@@ -506,7 +506,10 @@ define { i32, i1 } @ssub_no_canonicalize_constant_arg0(i32 %x) nounwind {
define { i32, i1 } @usub_no_canonicalize_constant_arg0(i32 %x) nounwind {
; CHECK-LABEL: @usub_no_canonicalize_constant_arg0(
-; CHECK-NEXT: [[A:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 [[X:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = sub i32 42, [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 42, [[X]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i32, i1 } undef, i32 [[TMP1]], 0
+; CHECK-NEXT: [[A:%.*]] = insertvalue { i32, i1 } [[TMP3]], i1 [[TMP2]], 1
; CHECK-NEXT: ret { i32, i1 } [[A]]
;
%a = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 %x)
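The RUN line for this file now uses -passes='instcombine<no-verify-fixpoint>',
presumably because the freshly inserted sub/icmp/insertvalue chain can itself be
folded further on a later visit, so a single InstCombine iteration no longer
reaches a fixpoint. For the constant-argument case the regenerated checks
correspond to IR of roughly this shape (names are illustrative):

  %s   = sub i32 42, %x
  %ov  = icmp ult i32 42, %x
  %t0  = insertvalue { i32, i1 } undef, i32 %s, 0
  %agg = insertvalue { i32, i1 } %t0, i1 %ov, 1
  ret { i32, i1 } %agg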
>From e3fdf8dd13a1a8c3fc3ea7dd1916762d95276570 Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Fri, 5 Dec 2025 23:46:22 +0530
Subject: [PATCH 2/4] Formatted with git clang-format HEAD~1
---
llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index af85985843914..3bd7eb855b147 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -864,17 +864,17 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
WO->getRHS(), *WO, OperationResult, OverflowResult))
return createOverflowTuple(WO, OperationResult, OverflowResult);
-
- // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y}
+
+ // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y}
if (WO->getBinaryOp() == Instruction::Sub && !WO->isSigned()) {
IRBuilder<> Builder(WO);
Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
-
+
Value *ResultStruct = UndefValue::get(WO->getType());
ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
-
+
return replaceInstUsesWith(*WO, ResultStruct);
}
>From aeef41f725b96ec57f72c2eb9788735419ae7172 Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Sat, 6 Dec 2025 00:27:48 +0530
Subject: [PATCH 3/4] Fix formatting and replace undef with poison
---
llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 2 +-
.../result-of-usub-is-non-zero-and-no-overflow.ll | 12 ++++++------
llvm/test/Transforms/InstCombine/with_overflow.ll | 2 +-
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 3bd7eb855b147..d0b71f12c3159 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -871,7 +871,7 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
- Value *ResultStruct = UndefValue::get(WO->getType());
+ Value *ResultStruct = PoisonValue::get(WO->getType());
ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
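This patch switches the placeholder aggregate from undef to poison. Both struct
fields are overwritten by the insertvalue chain, so poison is the conventional
placeholder here and leaves more room for later folds. The resulting IR shape,
sketched with illustrative names:

  %t0  = insertvalue { i8, i1 } poison, i8 %sub, 0
  %agg = insertvalue { i8, i1 } %t0, i1 %ov, 1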
diff --git a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
index 46b8a853e6cf5..f8b318bc3680a 100644
--- a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
@@ -143,7 +143,7 @@ define i1 @t2(i8 %base, i8 %offset) {
; CHECK-LABEL: @t2(
; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
@@ -170,7 +170,7 @@ define i1 @t2_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t2_logical(
; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
@@ -323,7 +323,7 @@ define i1 @t6_commutability(i8 %base, i8 %offset) {
; CHECK-LABEL: @t6_commutability(
; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
@@ -350,7 +350,7 @@ define i1 @t6_commutability_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t6_commutability_logical(
; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
@@ -461,7 +461,7 @@ define i1 @t8(i8 %base, i8 %offset) {
; CHECK-LABEL: @t8(
; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
@@ -484,7 +484,7 @@ define i1 @t8_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t8_logical(
; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
diff --git a/llvm/test/Transforms/InstCombine/with_overflow.ll b/llvm/test/Transforms/InstCombine/with_overflow.ll
index 4f7a15cc89d6c..0c82bdc256ddf 100644
--- a/llvm/test/Transforms/InstCombine/with_overflow.ll
+++ b/llvm/test/Transforms/InstCombine/with_overflow.ll
@@ -508,7 +508,7 @@ define { i32, i1 } @usub_no_canonicalize_constant_arg0(i32 %x) nounwind {
; CHECK-LABEL: @usub_no_canonicalize_constant_arg0(
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 42, [[X:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 42, [[X]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i32, i1 } undef, i32 [[TMP1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i32, i1 } poison, i32 [[TMP1]], 0
; CHECK-NEXT: [[A:%.*]] = insertvalue { i32, i1 } [[TMP3]], i1 [[TMP2]], 1
; CHECK-NEXT: ret { i32, i1 } [[A]]
;
>From 87d56d3d369db1fef1789ccbc3f7890e30daa96a Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Sat, 6 Dec 2025 23:16:05 +0530
Subject: [PATCH 4/4] Address review feedback
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 4 +-
.../InstCombine/InstCombineCalls.cpp | 13 -
.../test/CodeGen/RISCV/arith-with-overflow.ll | 7 +-
.../CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll | 24 +-
llvm/test/CodeGen/RISCV/rvv/abs-vp.ll | 4 +-
llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll | 44 +-
llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll | 16 +-
llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll | 46 +-
llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll | 68 +-
.../CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-bitreverse-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-bswap-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-ceil-vp.ll | 64 +-
.../RISCV/rvv/fixed-vectors-ctlz-vp.ll | 432 ++-
.../RISCV/rvv/fixed-vectors-ctpop-vp.ll | 76 +-
.../RISCV/rvv/fixed-vectors-cttz-vp.ll | 244 +-
.../RISCV/rvv/fixed-vectors-floor-vp.ll | 64 +-
.../RISCV/rvv/fixed-vectors-fmaximum-vp.ll | 77 +-
.../RISCV/rvv/fixed-vectors-fminimum-vp.ll | 77 +-
.../RISCV/rvv/fixed-vectors-fpext-vp.ll | 8 +-
.../RISCV/rvv/fixed-vectors-fptosi-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-fptoui-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-fptrunc-vp.ll | 8 +-
.../RISCV/rvv/fixed-vectors-nearbyint-vp.ll | 32 +-
.../rvv/fixed-vectors-reduction-fp-vp.ll | 16 +-
.../rvv/fixed-vectors-reduction-int-vp.ll | 8 +-
.../rvv/fixed-vectors-reduction-mask-vp.ll | 8 +-
.../RISCV/rvv/fixed-vectors-rint-vp.ll | 32 +-
.../RISCV/rvv/fixed-vectors-round-vp.ll | 64 +-
.../RISCV/rvv/fixed-vectors-roundeven-vp.ll | 64 +-
.../RISCV/rvv/fixed-vectors-roundtozero-vp.ll | 64 +-
.../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-setcc-int-vp.ll | 50 +-
.../RISCV/rvv/fixed-vectors-sext-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-sitofp-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-strided-vpload.ll | 74 +-
.../rvv/fixed-vectors-strided-vpstore.ll | 18 +-
.../RISCV/rvv/fixed-vectors-trunc-vp.ll | 299 +-
.../RISCV/rvv/fixed-vectors-uitofp-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vadd-vp.ll | 32 +-
.../RISCV/rvv/fixed-vectors-vcopysign-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vfabs-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vfma-vp.ll | 48 +-
.../RISCV/rvv/fixed-vectors-vfmax-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vfmin-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vfmuladd-vp.ll | 48 +-
.../RISCV/rvv/fixed-vectors-vfneg-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vfsqrt-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vmax-vp.ll | 24 +-
.../RISCV/rvv/fixed-vectors-vmaxu-vp.ll | 24 +-
.../RISCV/rvv/fixed-vectors-vmin-vp.ll | 24 +-
.../RISCV/rvv/fixed-vectors-vminu-vp.ll | 24 +-
.../RISCV/rvv/fixed-vectors-vpgather.ll | 184 +-
.../CodeGen/RISCV/rvv/fixed-vectors-vpload.ll | 24 +-
.../RISCV/rvv/fixed-vectors-vpmerge.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vpscatter.ll | 64 +-
.../RISCV/rvv/fixed-vectors-vpstore.ll | 8 +-
.../RISCV/rvv/fixed-vectors-vsadd-vp.ll | 32 +-
.../RISCV/rvv/fixed-vectors-vsaddu-vp.ll | 32 +-
.../RISCV/rvv/fixed-vectors-vselect-vp.ll | 28 +-
.../RISCV/rvv/fixed-vectors-vssub-vp.ll | 32 +-
.../RISCV/rvv/fixed-vectors-vssubu-vp.ll | 32 +-
.../RISCV/rvv/fixed-vectors-zext-vp.ll | 16 +-
llvm/test/CodeGen/RISCV/rvv/floor-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll | 68 +-
llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll | 68 +-
llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll | 4 +-
llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll | 2 +-
llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll | 2 +-
llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll | 52 +-
.../RISCV/rvv/nontemporal-vp-scalable.ll | 3010 ++++++++---------
llvm/test/CodeGen/RISCV/rvv/rint-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/round-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll | 869 +++--
llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll | 297 +-
llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll | 50 +-
.../test/CodeGen/RISCV/rvv/strided-vpstore.ll | 82 +-
llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll | 18 +-
llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll | 4 +-
llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll | 531 +--
llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll | 276 +-
llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll | 2956 ++++------------
llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll | 139 +-
llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll | 6 +-
llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll | 4 +-
llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll | 2 +-
llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll | 6 +-
llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll | 6 +-
llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll | 12 +-
llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll | 276 +-
llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll | 18 +-
llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll | 18 +-
llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll | 18 +-
llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll | 18 +-
llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll | 16 +-
llvm/test/CodeGen/RISCV/rvv/vp-splat.ll | 2 +-
llvm/test/CodeGen/RISCV/rvv/vp-splice.ll | 140 +-
.../test/CodeGen/RISCV/rvv/vpgather-sdnode.ll | 28 +-
llvm/test/CodeGen/RISCV/rvv/vpload.ll | 34 +-
llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll | 18 +-
.../CodeGen/RISCV/rvv/vpscatter-sdnode.ll | 16 +-
llvm/test/CodeGen/RISCV/rvv/vpstore.ll | 20 +-
.../CodeGen/RISCV/rvv/vreductions-fp-vp.ll | 4 +-
.../CodeGen/RISCV/rvv/vreductions-int-vp.ll | 2 +-
.../CodeGen/RISCV/rvv/vreductions-mask-vp.ll | 2 +-
llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll | 30 +-
llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll | 4 +-
llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll | 12 +-
llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll | 4 +-
llvm/test/CodeGen/RISCV/usub_sat.ll | 48 +-
llvm/test/CodeGen/RISCV/usub_sat_plus.ll | 44 +-
llvm/test/CodeGen/RISCV/xaluo.ll | 129 +-
llvm/test/CodeGen/RISCV/xqcia.ll | 6 +-
.../test/Transforms/InstCombine/known-bits.ll | 15 +-
llvm/test/Transforms/InstCombine/pr170634.ll | 5 +-
...ult-of-usub-is-non-zero-and-no-overflow.ll | 60 +-
.../usub-overflow-known-by-implied-cond.ll | 40 +-
llvm/test/Transforms/InstCombine/usubo.ll | 10 +-
.../Transforms/InstCombine/with_overflow.ll | 5 +-
132 files changed, 5135 insertions(+), 7744 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 172c7485e108b..8b46c4c1e66db 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11466,7 +11466,9 @@ void TargetLowering::expandUADDSUBO(
DAG.getConstant(0, dl, Node->getValueType(0)), ISD::SETNE);
} else {
ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT;
- SetCC = DAG.getSetCC(dl, SetCCType, Result, LHS, CC);
+ SDValue CompareLHS = IsAdd ? Result : LHS;
+ SDValue CompareRHS = IsAdd ? LHS : RHS;
+ SetCC = DAG.getSetCC(dl, SetCCType, CompareLHS, CompareRHS, CC);
}
Overflow = DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType);
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d0b71f12c3159..743c4f574e131 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -865,19 +865,6 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
WO->getRHS(), *WO, OperationResult, OverflowResult))
return createOverflowTuple(WO, OperationResult, OverflowResult);
- // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y}
- if (WO->getBinaryOp() == Instruction::Sub && !WO->isSigned()) {
- IRBuilder<> Builder(WO);
- Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
- Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
-
- Value *ResultStruct = PoisonValue::get(WO->getType());
- ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
- ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
-
- return replaceInstUsesWith(*WO, ResultStruct);
- }
-
// See whether we can optimize the overflow check with assumption information.
for (User *U : WO->users()) {
if (!match(U, m_ExtractValue<1>(m_Value())))
diff --git a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
index 557b4b7c2afa2..84526a1fca0f9 100644
--- a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
+++ b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
@@ -54,9 +54,10 @@ entry:
define i1 @usub(i32 %a, i32 %b, ptr %c) nounwind {
; RV32I-LABEL: usub:
; RV32I: # %bb.0: # %entry
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
-; RV32I-NEXT: sw a1, 0(a2)
+; RV32I-NEXT: sltu a3, a1, a0
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: mv a0, a3
; RV32I-NEXT: ret
entry:
%x = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
index ea9786d0b10b3..f5f122a8c9dd7 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
@@ -715,7 +715,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
; RV32I-NEXT: zext.b a0, a3
; RV32I-NEXT: sub a1, a0, s1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sltu a0, s1, a0
; RV32I-NEXT: addi a0, a0, -1
; RV32I-NEXT: and a2, a0, a1
; RV32I-NEXT: sb a3, 3(sp)
@@ -755,7 +755,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
; RV32IA-NEXT: srl a4, a4, a0
; RV32IA-NEXT: zext.b a4, a4
; RV32IA-NEXT: sub a6, a4, a1
-; RV32IA-NEXT: sltu a4, a4, a6
+; RV32IA-NEXT: sltu a4, a1, a4
; RV32IA-NEXT: addi a4, a4, -1
; RV32IA-NEXT: and a4, a4, a6
; RV32IA-NEXT: sll a4, a4, a0
@@ -792,7 +792,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
; RV64I-NEXT: zext.b a0, a3
; RV64I-NEXT: sub a1, a0, s1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sltu a0, s1, a0
; RV64I-NEXT: addi a0, a0, -1
; RV64I-NEXT: and a2, a0, a1
; RV64I-NEXT: sb a3, 7(sp)
@@ -832,7 +832,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
; RV64IA-NEXT: sext.w a6, a3
; RV64IA-NEXT: zext.b a5, a5
; RV64IA-NEXT: sub a7, a5, a1
-; RV64IA-NEXT: sltu a5, a5, a7
+; RV64IA-NEXT: sltu a5, a1, a5
; RV64IA-NEXT: addi a5, a5, -1
; RV64IA-NEXT: and a5, a5, a7
; RV64IA-NEXT: sllw a5, a5, a0
@@ -877,7 +877,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
; RV32I-NEXT: and a0, a3, s1
; RV32I-NEXT: sub a1, a0, s2
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sltu a0, s2, a0
; RV32I-NEXT: addi a0, a0, -1
; RV32I-NEXT: and a2, a0, a1
; RV32I-NEXT: sh a3, 14(sp)
@@ -920,7 +920,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
; RV32IA-NEXT: srl a5, a5, a0
; RV32IA-NEXT: and a5, a5, a3
; RV32IA-NEXT: sub a7, a5, a1
-; RV32IA-NEXT: sltu a5, a5, a7
+; RV32IA-NEXT: sltu a5, a1, a5
; RV32IA-NEXT: addi a5, a5, -1
; RV32IA-NEXT: and a5, a5, a7
; RV32IA-NEXT: sll a5, a5, a0
@@ -961,7 +961,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
; RV64I-NEXT: and a0, a3, s1
; RV64I-NEXT: sub a1, a0, s2
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sltu a0, s2, a0
; RV64I-NEXT: addi a0, a0, -1
; RV64I-NEXT: and a2, a0, a1
; RV64I-NEXT: sh a3, 14(sp)
@@ -1004,7 +1004,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
; RV64IA-NEXT: sext.w a7, a4
; RV64IA-NEXT: and a6, a6, a3
; RV64IA-NEXT: sub t0, a6, a1
-; RV64IA-NEXT: sltu a6, a6, t0
+; RV64IA-NEXT: sltu a6, a1, a6
; RV64IA-NEXT: addi a6, a6, -1
; RV64IA-NEXT: and a6, a6, t0
; RV64IA-NEXT: sllw a6, a6, a0
@@ -1044,7 +1044,7 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
; RV32I-NEXT: .LBB6_1: # %atomicrmw.start
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
; RV32I-NEXT: sub a0, a3, s1
-; RV32I-NEXT: sltu a1, a3, a0
+; RV32I-NEXT: sltu a1, s1, a3
; RV32I-NEXT: addi a1, a1, -1
; RV32I-NEXT: and a2, a1, a0
; RV32I-NEXT: sw a3, 0(sp)
@@ -1075,7 +1075,7 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
; RV32IA-NEXT: # Child Loop BB6_3 Depth 2
; RV32IA-NEXT: mv a3, a2
; RV32IA-NEXT: sub a2, a2, a1
-; RV32IA-NEXT: sltu a4, a3, a2
+; RV32IA-NEXT: sltu a4, a1, a3
; RV32IA-NEXT: addi a4, a4, -1
; RV32IA-NEXT: and a4, a4, a2
; RV32IA-NEXT: .LBB6_3: # %atomicrmw.start
@@ -1298,7 +1298,7 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
; RV64I-NEXT: .LBB7_1: # %atomicrmw.start
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
; RV64I-NEXT: sub a0, a3, s1
-; RV64I-NEXT: sltu a1, a3, a0
+; RV64I-NEXT: sltu a1, s1, a3
; RV64I-NEXT: addi a1, a1, -1
; RV64I-NEXT: and a2, a1, a0
; RV64I-NEXT: sd a3, 0(sp)
@@ -1329,7 +1329,7 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
; RV64IA-NEXT: # Child Loop BB7_3 Depth 2
; RV64IA-NEXT: mv a3, a2
; RV64IA-NEXT: sub a2, a2, a1
-; RV64IA-NEXT: sltu a4, a3, a2
+; RV64IA-NEXT: sltu a4, a1, a3
; RV64IA-NEXT: addi a4, a4, -1
; RV64IA-NEXT: and a4, a4, a2
; RV64IA-NEXT: .LBB7_3: # %atomicrmw.start
diff --git a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
index 5b215c5173211..0fb4b2a06b76f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
@@ -519,7 +519,7 @@ define <vscale x 16 x i64> @vp_abs_nxv16i64(<vscale x 16 x i64> %va, <vscale x 1
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a3, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -543,7 +543,7 @@ define <vscale x 16 x i64> @vp_abs_nxv16i64_unmasked(<vscale x 16 x i64> %va, i3
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 09b8fdbf11d26..025f944bcd51c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -3018,7 +3018,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
; CHECK-NEXT: slli a3, a3, 2
; CHECK-NEXT: vslidedown.vx v0, v0, a4
; CHECK-NEXT: sub a4, a0, a3
-; CHECK-NEXT: sltu a5, a0, a4
+; CHECK-NEXT: sltu a5, a3, a0
; CHECK-NEXT: addi a5, a5, -1
; CHECK-NEXT: and a5, a5, a4
; CHECK-NEXT: lui a6, 5
@@ -3079,7 +3079,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
; CHECK-ZVBB-NEXT: slli a1, a1, 2
; CHECK-ZVBB-NEXT: vslidedown.vx v0, v0, a2
; CHECK-ZVBB-NEXT: sub a2, a0, a1
-; CHECK-ZVBB-NEXT: sltu a3, a0, a2
+; CHECK-ZVBB-NEXT: sltu a3, a1, a0
; CHECK-ZVBB-NEXT: addi a3, a3, -1
; CHECK-ZVBB-NEXT: and a2, a3, a2
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e16, m8, ta, ma
@@ -3104,7 +3104,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16_unmasked(<vscale x 64 x i16>
; CHECK-NEXT: lui a2, 3
; CHECK-NEXT: slli a3, a3, 2
; CHECK-NEXT: sub a4, a0, a3
-; CHECK-NEXT: sltu a5, a0, a4
+; CHECK-NEXT: sltu a5, a3, a0
; CHECK-NEXT: addi a5, a5, -1
; CHECK-NEXT: and a5, a5, a4
; CHECK-NEXT: lui a6, 5
@@ -3160,7 +3160,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16_unmasked(<vscale x 64 x i16>
; CHECK-ZVBB-NEXT: csrr a1, vlenb
; CHECK-ZVBB-NEXT: slli a1, a1, 2
; CHECK-ZVBB-NEXT: sub a2, a0, a1
-; CHECK-ZVBB-NEXT: sltu a3, a0, a2
+; CHECK-ZVBB-NEXT: sltu a3, a1, a0
; CHECK-ZVBB-NEXT: addi a3, a3, -1
; CHECK-ZVBB-NEXT: and a2, a3, a2
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e16, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index 0177b8cfd4393..668a770610f20 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -1534,7 +1534,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
; CHECK-NEXT: slli a1, a1, 2
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
@@ -1561,7 +1561,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
; CHECK-ZVKB-NEXT: slli a1, a1, 2
; CHECK-ZVKB-NEXT: vslidedown.vx v0, v0, a2
; CHECK-ZVKB-NEXT: sub a2, a0, a1
-; CHECK-ZVKB-NEXT: sltu a3, a0, a2
+; CHECK-ZVKB-NEXT: sltu a3, a1, a0
; CHECK-ZVKB-NEXT: addi a3, a3, -1
; CHECK-ZVKB-NEXT: and a2, a3, a2
; CHECK-ZVKB-NEXT: vsetvli zero, a2, e16, m8, ta, ma
@@ -1584,7 +1584,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16_unmasked(<vscale x 64 x i16> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
@@ -1606,7 +1606,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16_unmasked(<vscale x 64 x i16> %va,
; CHECK-ZVKB-NEXT: csrr a1, vlenb
; CHECK-ZVKB-NEXT: slli a1, a1, 2
; CHECK-ZVKB-NEXT: sub a2, a0, a1
-; CHECK-ZVKB-NEXT: sltu a3, a0, a2
+; CHECK-ZVKB-NEXT: sltu a3, a1, a0
; CHECK-ZVKB-NEXT: addi a3, a3, -1
; CHECK-ZVKB-NEXT: and a2, a3, a2
; CHECK-ZVKB-NEXT: vsetvli zero, a2, e16, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
index 6c7709f52e30b..d3813b703c5be 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v5, v6
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16_unmasked(<vscale x 32 x bflo
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v6, v7
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v5, v6
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v6, v7
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -1585,7 +1585,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
; RV32ZFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZFH-NEXT: sub a2, a0, a1
; RV32ZFH-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZFH-NEXT: sltu a3, a0, a2
+; RV32ZFH-NEXT: sltu a3, a1, a0
; RV32ZFH-NEXT: addi a3, a3, -1
; RV32ZFH-NEXT: and a2, a3, a2
; RV32ZFH-NEXT: vmv1r.v v0, v6
@@ -1631,7 +1631,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
; RV64ZFH-NEXT: sub a3, a0, a1
; RV64ZFH-NEXT: slli a2, a2, 52
; RV64ZFH-NEXT: fmv.d.x fa5, a2
-; RV64ZFH-NEXT: sltu a2, a0, a3
+; RV64ZFH-NEXT: sltu a2, a1, a0
; RV64ZFH-NEXT: addi a2, a2, -1
; RV64ZFH-NEXT: and a2, a2, a3
; RV64ZFH-NEXT: vmv1r.v v0, v6
@@ -1676,7 +1676,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64_unmasked(<vscale x 16 x doubl
; RV32ZFH-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZFH-NEXT: sub a3, a0, a1
; RV32ZFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZFH-NEXT: sltu a2, a0, a3
+; RV32ZFH-NEXT: sltu a2, a1, a0
; RV32ZFH-NEXT: addi a2, a2, -1
; RV32ZFH-NEXT: and a2, a2, a3
; RV32ZFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -1710,7 +1710,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64_unmasked(<vscale x 16 x doubl
; RV64ZFH-NEXT: sub a3, a0, a1
; RV64ZFH-NEXT: slli a2, a2, 52
; RV64ZFH-NEXT: fmv.d.x fa5, a2
-; RV64ZFH-NEXT: sltu a2, a0, a3
+; RV64ZFH-NEXT: sltu a2, a1, a0
; RV64ZFH-NEXT: addi a2, a2, -1
; RV64ZFH-NEXT: and a2, a2, a3
; RV64ZFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 20f397b694180..f8293f6c671f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -1195,7 +1195,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; CHECK-NEXT: srli a3, a1, 3
; CHECK-NEXT: sub a5, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a3
-; CHECK-NEXT: sltu a3, a0, a5
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a5, a3, a5
; CHECK-NEXT: li a3, 1086
@@ -1228,7 +1228,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; CHECK-ZVBB-NEXT: srli a2, a1, 3
; CHECK-ZVBB-NEXT: sub a3, a0, a1
; CHECK-ZVBB-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT: sltu a2, a0, a3
+; CHECK-ZVBB-NEXT: sltu a2, a1, a0
; CHECK-ZVBB-NEXT: addi a2, a2, -1
; CHECK-ZVBB-NEXT: and a2, a2, a3
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -1252,7 +1252,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
; CHECK-NEXT: fsrmi a4, 1
; CHECK-NEXT: li a2, 52
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: sltu a5, a0, a3
+; CHECK-NEXT: sltu a5, a1, a0
; CHECK-NEXT: addi a5, a5, -1
; CHECK-NEXT: and a5, a5, a3
; CHECK-NEXT: li a3, 1086
@@ -1280,7 +1280,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: csrr a1, vlenb
; CHECK-ZVBB-NEXT: sub a2, a0, a1
-; CHECK-ZVBB-NEXT: sltu a3, a0, a2
+; CHECK-ZVBB-NEXT: sltu a3, a1, a0
; CHECK-ZVBB-NEXT: addi a3, a3, -1
; CHECK-ZVBB-NEXT: and a2, a3, a2
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2402,7 +2402,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a4, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a4
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a4, a2, a4
; CHECK-NEXT: li a2, 52
@@ -2433,7 +2433,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
; CHECK-ZVBB-NEXT: srli a2, a1, 3
; CHECK-ZVBB-NEXT: sub a3, a0, a1
; CHECK-ZVBB-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT: sltu a2, a0, a3
+; CHECK-ZVBB-NEXT: sltu a2, a1, a0
; CHECK-ZVBB-NEXT: addi a2, a2, -1
; CHECK-ZVBB-NEXT: and a2, a2, a3
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2456,7 +2456,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: fsrmi a3, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a4, a4, a2
; CHECK-NEXT: li a2, 52
@@ -2482,7 +2482,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: csrr a1, vlenb
; CHECK-ZVBB-NEXT: sub a2, a0, a1
-; CHECK-ZVBB-NEXT: sltu a3, a0, a2
+; CHECK-ZVBB-NEXT: sltu a3, a1, a0
; CHECK-ZVBB-NEXT: addi a3, a3, -1
; CHECK-ZVBB-NEXT: and a2, a3, a2
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index 1bbefc65d3e39..d16418f57033a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -1990,7 +1990,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV32-NEXT: addi a2, a2, 1365
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: sltu a2, a0, a3
+; RV32-NEXT: sltu a2, a1, a0
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: and a2, a2, a3
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2117,10 +2117,15 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV64-NEXT: lui a3, 209715
; RV64-NEXT: lui a4, 61681
; RV64-NEXT: lui a5, 4112
+; RV64-NEXT: sub a6, a0, a1
+; RV64-NEXT: sltu a1, a1, a0
+; RV64-NEXT: li a0, 56
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
; RV64-NEXT: addi a4, a4, -241
; RV64-NEXT: addi a5, a5, 257
+; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: and a1, a1, a6
; RV64-NEXT: slli a6, a2, 32
; RV64-NEXT: add a2, a2, a6
; RV64-NEXT: slli a6, a3, 32
@@ -2129,11 +2134,6 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV64-NEXT: add a4, a4, a6
; RV64-NEXT: slli a6, a5, 32
; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: li a6, 56
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a1
; RV64-NEXT: vand.vx v24, v24, a2, v0.t
; RV64-NEXT: vsub.vv v8, v8, v24, v0.t
; RV64-NEXT: vand.vx v24, v8, a3, v0.t
@@ -2144,9 +2144,9 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV64-NEXT: vadd.vv v8, v8, v24, v0.t
; RV64-NEXT: vand.vx v8, v8, a4, v0.t
; RV64-NEXT: vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t
+; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: vmv1r.v v0, v7
-; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 1, v0.t
; RV64-NEXT: vand.vx v24, v24, a2, v0.t
; RV64-NEXT: vsub.vv v16, v16, v24, v0.t
@@ -2158,7 +2158,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV64-NEXT: vadd.vv v16, v16, v24, v0.t
; RV64-NEXT: vand.vx v16, v16, a4, v0.t
; RV64-NEXT: vmul.vx v16, v16, a5, v0.t
-; RV64-NEXT: vsrl.vx v16, v16, a6, v0.t
+; RV64-NEXT: vsrl.vx v16, v16, a0, v0.t
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64:
@@ -2169,7 +2169,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; CHECK-ZVBB-NEXT: srli a2, a1, 3
; CHECK-ZVBB-NEXT: sub a3, a0, a1
; CHECK-ZVBB-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT: sltu a2, a0, a3
+; CHECK-ZVBB-NEXT: sltu a2, a1, a0
; CHECK-ZVBB-NEXT: addi a2, a2, -1
; CHECK-ZVBB-NEXT: and a2, a2, a3
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2200,10 +2200,10 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
; RV32-NEXT: lui a3, 209715
; RV32-NEXT: sub a4, a0, a1
; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v0, a2
-; RV32-NEXT: sltu a2, a0, a4
+; RV32-NEXT: sltu a2, a1, a0
+; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: and a2, a2, a4
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2308,10 +2308,15 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
; RV64-NEXT: lui a4, 209715
; RV64-NEXT: lui a5, 61681
; RV64-NEXT: lui a6, 4112
+; RV64-NEXT: sub a7, a0, a2
+; RV64-NEXT: sltu a0, a2, a0
+; RV64-NEXT: li a2, 56
; RV64-NEXT: addi a3, a3, 1365
; RV64-NEXT: addi a4, a4, 819
; RV64-NEXT: addi a5, a5, -241
; RV64-NEXT: addi a6, a6, 257
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: and a0, a0, a7
; RV64-NEXT: slli a7, a3, 32
; RV64-NEXT: add a3, a3, a7
; RV64-NEXT: slli a7, a4, 32
@@ -2320,11 +2325,6 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
; RV64-NEXT: add a5, a5, a7
; RV64-NEXT: slli a7, a6, 32
; RV64-NEXT: add a6, a6, a7
-; RV64-NEXT: li a7, 56
-; RV64-NEXT: sub a2, a0, a2
-; RV64-NEXT: sltu a0, a0, a2
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a2
; RV64-NEXT: vand.vx v24, v24, a3
; RV64-NEXT: vsub.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -2346,26 +2346,26 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
; RV64-NEXT: vadd.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vand.vx v8, v8, a5
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vadd.vv v16, v16, v24
+; RV64-NEXT: vand.vx v16, v16, a5
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vmul.vx v8, v8, a6
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v16, v16, a5
+; RV64-NEXT: vmul.vx v16, v16, a6
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vx v8, v8, a7
+; RV64-NEXT: vsrl.vx v8, v8, a2
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vmul.vx v16, v16, a6
-; RV64-NEXT: vsrl.vx v16, v16, a7
+; RV64-NEXT: vsrl.vx v16, v16, a2
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64_unmasked:
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: csrr a1, vlenb
; CHECK-ZVBB-NEXT: sub a2, a0, a1
-; CHECK-ZVBB-NEXT: sltu a3, a0, a2
+; CHECK-ZVBB-NEXT: sltu a3, a1, a0
; CHECK-ZVBB-NEXT: addi a3, a3, -1
; CHECK-ZVBB-NEXT: and a2, a3, a2
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
index c82ad17545a6a..464c4d1f5f899 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
@@ -2154,7 +2154,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV32-NEXT: srli a2, a1, 3
; RV32-NEXT: sub a3, a0, a1
; RV32-NEXT: vslidedown.vx v0, v0, a2
-; RV32-NEXT: sltu a2, a0, a3
+; RV32-NEXT: sltu a2, a1, a0
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: and a2, a2, a3
; RV32-NEXT: lui a3, 349525
@@ -2190,31 +2190,31 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV32-NEXT: vand.vv v8, v8, v24, v0.t
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
; RV32-NEXT: lui a3, 61681
; RV32-NEXT: addi a3, a3, -241
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a3
+; RV32-NEXT: vmv.v.x v8, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
; RV32-NEXT: lui a3, 4112
; RV32-NEXT: addi a3, a3, 257
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a3
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a2, 56
-; RV32-NEXT: vsrl.vx v8, v16, a2, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: addi a3, sp, 16
; RV32-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB46_2
@@ -2226,11 +2226,11 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vi v16, v8, -1, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vadd.vi v8, v16, -1, v0.t
+; RV32-NEXT: vnot.v v16, v16, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
@@ -2286,11 +2286,14 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV64-NEXT: lui a5, 4112
; RV64-NEXT: srli a6, a1, 3
; RV64-NEXT: sub a7, a0, a1
+; RV64-NEXT: vslidedown.vx v0, v0, a6
+; RV64-NEXT: sltu a6, a1, a0
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
; RV64-NEXT: addi a4, a4, -241
; RV64-NEXT: addi t0, a5, 257
-; RV64-NEXT: vslidedown.vx v0, v0, a6
+; RV64-NEXT: addi a6, a6, -1
+; RV64-NEXT: and a7, a6, a7
; RV64-NEXT: slli a6, a2, 32
; RV64-NEXT: add a6, a2, a6
; RV64-NEXT: slli a5, a3, 32
@@ -2299,9 +2302,6 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV64-NEXT: add a2, a4, a2
; RV64-NEXT: slli a3, t0, 32
; RV64-NEXT: add a3, t0, a3
-; RV64-NEXT: sltu a4, a0, a7
-; RV64-NEXT: addi a4, a4, -1
-; RV64-NEXT: and a7, a4, a7
; RV64-NEXT: li a4, 56
; RV64-NEXT: vsetvli zero, a7, e64, m8, ta, ma
; RV64-NEXT: vadd.vi v24, v16, -1, v0.t
@@ -2350,7 +2350,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; CHECK-ZVBB-NEXT: srli a2, a1, 3
; CHECK-ZVBB-NEXT: sub a3, a0, a1
; CHECK-ZVBB-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT: sltu a2, a0, a3
+; CHECK-ZVBB-NEXT: sltu a2, a1, a0
; CHECK-ZVBB-NEXT: addi a2, a2, -1
; CHECK-ZVBB-NEXT: and a2, a2, a3
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2381,10 +2381,10 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
; RV32-NEXT: lui a3, 209715
; RV32-NEXT: sub a4, a0, a1
; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v0, a2
-; RV32-NEXT: sltu a2, a0, a4
+; RV32-NEXT: sltu a2, a1, a0
+; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: and a2, a2, a4
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2489,21 +2489,21 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
; RV64-NEXT: lui a4, 61681
; RV64-NEXT: lui a5, 4112
; RV64-NEXT: sub a6, a0, a1
+; RV64-NEXT: sltu a7, a1, a0
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
-; RV64-NEXT: addi a7, a4, -241
-; RV64-NEXT: addi t0, a5, 257
+; RV64-NEXT: addi t0, a4, -241
+; RV64-NEXT: addi t1, a5, 257
+; RV64-NEXT: addi a7, a7, -1
+; RV64-NEXT: and a6, a7, a6
; RV64-NEXT: slli a5, a2, 32
; RV64-NEXT: add a5, a2, a5
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a4, a3, a4
-; RV64-NEXT: slli a2, a7, 32
-; RV64-NEXT: add a2, a7, a2
-; RV64-NEXT: slli a3, t0, 32
-; RV64-NEXT: add a3, t0, a3
-; RV64-NEXT: sltu a7, a0, a6
-; RV64-NEXT: addi a7, a7, -1
-; RV64-NEXT: and a6, a7, a6
+; RV64-NEXT: slli a2, t0, 32
+; RV64-NEXT: add a2, t0, a2
+; RV64-NEXT: slli a3, t1, 32
+; RV64-NEXT: add a3, t1, a3
; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
; RV64-NEXT: vadd.vi v24, v16, -1
; RV64-NEXT: vnot.v v16, v16
@@ -2547,7 +2547,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: csrr a1, vlenb
; CHECK-ZVBB-NEXT: sub a2, a0, a1
-; CHECK-ZVBB-NEXT: sltu a3, a0, a2
+; CHECK-ZVBB-NEXT: sltu a3, a1, a0
; CHECK-ZVBB-NEXT: addi a3, a3, -1
; CHECK-ZVBB-NEXT: and a2, a3, a2
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -3731,7 +3731,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a4, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a4
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a4, a2, a4
; CHECK-NEXT: li a2, 52
@@ -3766,7 +3766,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
; CHECK-ZVBB-NEXT: srli a2, a1, 3
; CHECK-ZVBB-NEXT: sub a3, a0, a1
; CHECK-ZVBB-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT: sltu a2, a0, a3
+; CHECK-ZVBB-NEXT: sltu a2, a1, a0
; CHECK-ZVBB-NEXT: addi a2, a2, -1
; CHECK-ZVBB-NEXT: and a2, a2, a3
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -3789,7 +3789,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: fsrmi a3, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a4, a4, a2
; CHECK-NEXT: li a2, 52
@@ -3819,7 +3819,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: csrr a1, vlenb
; CHECK-ZVBB-NEXT: sub a2, a0, a1
-; CHECK-ZVBB-NEXT: sltu a3, a0, a2
+; CHECK-ZVBB-NEXT: sltu a3, a1, a0
; CHECK-ZVBB-NEXT: addi a3, a3, -1
; CHECK-ZVBB-NEXT: and a2, a3, a2
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
index fa81e1f6f3514..912a63b09f1a7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
@@ -392,10 +392,10 @@ define <32 x i64> @vp_abs_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl)
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vrsub.vi v24, v8, 0, v0.t
; CHECK-NEXT: vmax.vv v8, v8, v24, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vrsub.vi v24, v16, 0, v0.t
@@ -417,10 +417,10 @@ define <32 x i64> @vp_abs_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vrsub.vi v24, v8, 0
; CHECK-NEXT: vmax.vv v8, v8, v24
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vrsub.vi v24, v16, 0
; CHECK-NEXT: vmax.vv v16, v16, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index f436bbb9a66ca..8e322b64ef551 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -2386,10 +2386,10 @@ define <128 x i16> @vp_bitreverse_v128i16(<128 x i16> %va, <128 x i1> %m, i32 ze
; CHECK-NEXT: vsrl.vi v24, v8, 8, v0.t
; CHECK-NEXT: lui a1, 1
; CHECK-NEXT: lui a2, 3
-; CHECK-NEXT: addi a3, a0, -64
-; CHECK-NEXT: sltu a0, a0, a3
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a3, a0, a3
+; CHECK-NEXT: sltiu a3, a0, 65
+; CHECK-NEXT: addi a0, a0, -64
+; CHECK-NEXT: neg a3, a3
+; CHECK-NEXT: and a3, a3, a0
; CHECK-NEXT: lui a0, 5
; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t
; CHECK-NEXT: addi a1, a1, -241
@@ -2450,10 +2450,10 @@ define <128 x i16> @vp_bitreverse_v128i16_unmasked(<128 x i16> %va, i32 zeroext
; CHECK-NEXT: vsll.vi v8, v8, 8
; CHECK-NEXT: lui a2, 1
; CHECK-NEXT: lui a3, 3
-; CHECK-NEXT: addi a4, a0, -64
-; CHECK-NEXT: sltu a0, a0, a4
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a4
+; CHECK-NEXT: sltiu a4, a0, 65
+; CHECK-NEXT: addi a0, a0, -64
+; CHECK-NEXT: neg a4, a4
+; CHECK-NEXT: and a0, a4, a0
; CHECK-NEXT: lui a4, 5
; CHECK-NEXT: vor.vv v8, v8, v24
; CHECK-NEXT: addi a2, a2, -241
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
index eca94ccb9bf7f..c1c9e581decf8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
@@ -1275,10 +1275,10 @@ define <128 x i16> @vp_bswap_v128i16(<128 x i16> %va, <128 x i1> %m, i32 zeroext
; CHECK-NEXT: vsrl.vi v24, v8, 8, v0.t
; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t
; CHECK-NEXT: vor.vv v8, v8, v24, v0.t
-; CHECK-NEXT: addi a1, a0, -64
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 65
+; CHECK-NEXT: addi a0, a0, -64
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vsrl.vi v24, v16, 8, v0.t
@@ -1302,10 +1302,10 @@ define <128 x i16> @vp_bswap_v128i16_unmasked(<128 x i16> %va, i32 zeroext %evl)
; CHECK-NEXT: vsrl.vi v24, v8, 8
; CHECK-NEXT: vsll.vi v8, v8, 8
; CHECK-NEXT: vor.vv v8, v8, v24
-; CHECK-NEXT: addi a1, a0, -64
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 65
+; CHECK-NEXT: addi a0, a0, -64
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vsrl.vi v24, v16, 8
; CHECK-NEXT: vsll.vi v16, v16, 8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
index 466d5d4b8e80a..b58de7abf0442 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT: addi a1, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a1
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a1
+; RV32ZVFH-NEXT: sltiu a1, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a1, a1
+; RV32ZVFH-NEXT: and a0, a1, a0
; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFH-NEXT: fsrmi a1, 3
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
; RV64ZVFH-NEXT: li a1, 1075
; RV64ZVFH-NEXT: slli a1, a1, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a1
-; RV64ZVFH-NEXT: addi a1, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a1
-; RV64ZVFH-NEXT: addi a0, a0, -1
+; RV64ZVFH-NEXT: sltiu a1, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a1, a1
+; RV64ZVFH-NEXT: and a0, a1, a0
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT: and a0, a0, a1
; RV64ZVFH-NEXT: fsrmi a1, 3
; RV64ZVFH-NEXT: vmv1r.v v0, v6
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT: addi a1, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a1
+; RV32ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a1, a1
+; RV32ZVFHMIN-NEXT: and a0, a1, a0
; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFHMIN-NEXT: fsrmi a1, 3
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
; RV64ZVFHMIN-NEXT: li a1, 1075
; RV64ZVFHMIN-NEXT: slli a1, a1, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT: addi a1, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
+; RV64ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a1, a1
+; RV64ZVFHMIN-NEXT: and a0, a1, a0
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT: and a0, a0, a1
; RV64ZVFHMIN-NEXT: fsrmi a1, 3
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
; RV32ZVFH-NEXT: vfabs.v v24, v8
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT: addi a2, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a2
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a2
+; RV32ZVFH-NEXT: sltiu a2, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a2, a2
+; RV32ZVFH-NEXT: and a0, a2, a0
; RV32ZVFH-NEXT: fsrmi a2, 3
; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
; RV64ZVFH-NEXT: li a2, 1075
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: addi a2, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a2
-; RV64ZVFH-NEXT: addi a0, a0, -1
-; RV64ZVFH-NEXT: and a0, a0, a2
+; RV64ZVFH-NEXT: sltiu a2, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a2, a2
+; RV64ZVFH-NEXT: and a0, a2, a0
; RV64ZVFH-NEXT: fsrmi a2, 3
; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT: addi a2, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a2
+; RV32ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a2, a2
+; RV32ZVFHMIN-NEXT: and a0, a2, a0
; RV32ZVFHMIN-NEXT: fsrmi a2, 3
; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
; RV64ZVFHMIN-NEXT: li a2, 1075
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: addi a2, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
-; RV64ZVFHMIN-NEXT: and a0, a0, a2
+; RV64ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a2, a2
+; RV64ZVFHMIN-NEXT: and a0, a2, a0
; RV64ZVFHMIN-NEXT: fsrmi a2, 3
; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index 00c36cb7f7327..d1fadc962c2eb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -1979,10 +1979,10 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: sltiu a3, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a0, a3, a0
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
@@ -2065,22 +2065,22 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV64-NEXT: lui a3, 209715
; RV64-NEXT: lui a4, 61681
; RV64-NEXT: lui a5, 4112
+; RV64-NEXT: sltiu a6, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
-; RV64-NEXT: addi a6, a4, -241
-; RV64-NEXT: addi a7, a5, 257
+; RV64-NEXT: addi a7, a4, -241
+; RV64-NEXT: addi t0, a5, 257
+; RV64-NEXT: neg a4, a6
+; RV64-NEXT: and a6, a4, a0
; RV64-NEXT: slli a5, a2, 32
; RV64-NEXT: add a5, a2, a5
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a4, a3, a4
-; RV64-NEXT: slli a2, a6, 32
-; RV64-NEXT: add a2, a6, a2
-; RV64-NEXT: slli a3, a7, 32
-; RV64-NEXT: add a3, a7, a3
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a6, a0, a6
+; RV64-NEXT: slli a2, a7, 32
+; RV64-NEXT: add a2, a7, a2
+; RV64-NEXT: slli a3, t0, 32
+; RV64-NEXT: add a3, t0, a3
; RV64-NEXT: li a0, 56
; RV64-NEXT: vor.vv v8, v8, v24, v0.t
; RV64-NEXT: vsrl.vi v24, v8, 2, v0.t
@@ -2150,9 +2150,9 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: li a2, 32
@@ -2160,110 +2160,102 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: addi a3, a3, 1365
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a3
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: sltiu a3, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a0, a3, a0
; RV32-NEXT: lui a3, 209715
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: vsrl.vi v0, v8, 2
; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 1
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 8
+; RV32-NEXT: vsrl.vi v0, v8, 4
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 2
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 16
+; RV32-NEXT: vsrl.vi v0, v8, 8
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 4
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v0, v8, a2
+; RV32-NEXT: vsrl.vi v0, v8, 16
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 8
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vsrl.vx v0, v8, a2
+; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 16
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vsrl.vx v0, v16, a2
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: vand.vv v0, v0, v24
-; RV32-NEXT: vsub.vv v0, v8, v0
+; RV32-NEXT: vsub.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v8, v16, a2
-; RV32-NEXT: vor.vv v24, v16, v8
+; RV32-NEXT: vnot.v v0, v16
+; RV32-NEXT: vsrl.vi v16, v0, 1
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a3
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v0, v8
-; RV32-NEXT: vsrl.vi v0, v0, 2
-; RV32-NEXT: vand.vv v0, v0, v8
-; RV32-NEXT: vadd.vv v16, v16, v0
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v24, v24
-; RV32-NEXT: vsrl.vi v0, v24, 1
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v0, v0, v16
-; RV32-NEXT: vsub.vv v24, v24, v0
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsub.vv v24, v0, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v16, 4
-; RV32-NEXT: vadd.vv v16, v16, v0
+; RV32-NEXT: vsrl.vi v0, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v24, v8
+; RV32-NEXT: vand.vv v0, v24, v16
; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v8, v24, v8
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: lui a2, 61681
; RV32-NEXT: lui a3, 4112
; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: addi a3, a3, 257
-; RV32-NEXT: vadd.vv v8, v0, v8
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v16, v0, v16
+; RV32-NEXT: vsrl.vi v24, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v24
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v16, v24
+; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vmul.vv v16, v16, v24
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v8, v16, a2
+; RV32-NEXT: vsrl.vx v8, v8, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v16, v24, a2
+; RV32-NEXT: vsrl.vx v16, v16, a2
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 16
; RV32-NEXT: addi sp, sp, 16
@@ -2285,95 +2277,95 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: lui a4, 209715
; RV64-NEXT: lui a5, 61681
; RV64-NEXT: lui a6, 4112
-; RV64-NEXT: addi a7, a3, 1365
-; RV64-NEXT: addi a3, a4, 819
-; RV64-NEXT: addi a4, a5, -241
-; RV64-NEXT: addi a6, a6, 257
-; RV64-NEXT: slli a5, a7, 32
-; RV64-NEXT: add a7, a7, a5
-; RV64-NEXT: slli a5, a3, 32
-; RV64-NEXT: add a5, a3, a5
-; RV64-NEXT: slli a3, a4, 32
-; RV64-NEXT: add a3, a4, a3
-; RV64-NEXT: slli a4, a6, 32
-; RV64-NEXT: add a4, a6, a4
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a6, a0, a6
-; RV64-NEXT: li a0, 56
+; RV64-NEXT: sltiu a7, a0, 17
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: addi a3, a3, 1365
+; RV64-NEXT: addi a4, a4, 819
+; RV64-NEXT: addi a5, a5, -241
+; RV64-NEXT: addi t0, a6, 257
+; RV64-NEXT: neg a6, a7
+; RV64-NEXT: and a0, a6, a0
+; RV64-NEXT: slli a6, a3, 32
+; RV64-NEXT: add a7, a3, a6
+; RV64-NEXT: slli a6, a4, 32
+; RV64-NEXT: add a6, a4, a6
+; RV64-NEXT: slli a3, a5, 32
+; RV64-NEXT: add a3, a5, a3
+; RV64-NEXT: slli a4, t0, 32
+; RV64-NEXT: add a4, t0, a4
+; RV64-NEXT: li a5, 56
; RV64-NEXT: vor.vv v8, v8, v24
; RV64-NEXT: vsrl.vi v24, v8, 2
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsrl.vi v24, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 1
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v8, 8
+; RV64-NEXT: vsrl.vi v24, v8, 4
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 2
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v8, 16
+; RV64-NEXT: vsrl.vi v24, v8, 8
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 4
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vx v24, v8, a2
+; RV64-NEXT: vsrl.vi v24, v8, 16
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 8
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsrl.vx v24, v8, a2
+; RV64-NEXT: vor.vv v8, v8, v24
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 16
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vnot.v v8, v8
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vor.vv v16, v16, v24
+; RV64-NEXT: vsrl.vx v24, v16, a2
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v8, 1
; RV64-NEXT: vand.vx v24, v24, a7
; RV64-NEXT: vsub.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vx v24, v16, a2
-; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v24, v8, a5
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vnot.v v16, v16
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v8, v8, a5
+; RV64-NEXT: vand.vx v24, v8, a6
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vx v8, v8, a6
; RV64-NEXT: vadd.vv v8, v24, v8
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 1
; RV64-NEXT: vand.vx v24, v24, a7
; RV64-NEXT: vsub.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v24, v16, a5
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vand.vx v24, v16, a6
; RV64-NEXT: vsrl.vi v16, v16, 2
+; RV64-NEXT: vand.vx v16, v16, a6
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vand.vx v8, v8, a3
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v16, v16, a5
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vmul.vx v8, v8, a4
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vadd.vv v16, v24, v16
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vmul.vx v8, v8, a4
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsrl.vx v8, v8, a5
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vand.vx v16, v16, a3
; RV64-NEXT: vmul.vx v16, v16, a4
-; RV64-NEXT: vsrl.vx v16, v16, a0
+; RV64-NEXT: vsrl.vx v16, v16, a5
; RV64-NEXT: ret
%v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 false, <32 x i1> splat (i1 true), i32 %evl)
ret <32 x i64> %v
@@ -4354,10 +4346,10 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: sltiu a3, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a0, a3, a0
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
@@ -4440,22 +4432,22 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV64-NEXT: lui a3, 209715
; RV64-NEXT: lui a4, 61681
; RV64-NEXT: lui a5, 4112
+; RV64-NEXT: sltiu a6, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
-; RV64-NEXT: addi a6, a4, -241
-; RV64-NEXT: addi a7, a5, 257
+; RV64-NEXT: addi a7, a4, -241
+; RV64-NEXT: addi t0, a5, 257
+; RV64-NEXT: neg a4, a6
+; RV64-NEXT: and a6, a4, a0
; RV64-NEXT: slli a5, a2, 32
; RV64-NEXT: add a5, a2, a5
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a4, a3, a4
-; RV64-NEXT: slli a2, a6, 32
-; RV64-NEXT: add a2, a6, a2
-; RV64-NEXT: slli a3, a7, 32
-; RV64-NEXT: add a3, a7, a3
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a6, a0, a6
+; RV64-NEXT: slli a2, a7, 32
+; RV64-NEXT: add a2, a7, a2
+; RV64-NEXT: slli a3, t0, 32
+; RV64-NEXT: add a3, t0, a3
; RV64-NEXT: li a0, 56
; RV64-NEXT: vor.vv v8, v8, v24, v0.t
; RV64-NEXT: vsrl.vi v24, v8, 2, v0.t
@@ -4525,9 +4517,9 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: li a2, 32
@@ -4535,110 +4527,102 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: addi a3, a3, 1365
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a3
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: sltiu a3, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a0, a3, a0
; RV32-NEXT: lui a3, 209715
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: vsrl.vi v0, v8, 2
; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 1
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 8
+; RV32-NEXT: vsrl.vi v0, v8, 4
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 2
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 16
+; RV32-NEXT: vsrl.vi v0, v8, 8
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 4
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v0, v8, a2
+; RV32-NEXT: vsrl.vi v0, v8, 16
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 8
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vsrl.vx v0, v8, a2
+; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 16
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vsrl.vx v0, v16, a2
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: vand.vv v0, v0, v24
-; RV32-NEXT: vsub.vv v0, v8, v0
+; RV32-NEXT: vsub.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v8, v16, a2
-; RV32-NEXT: vor.vv v24, v16, v8
+; RV32-NEXT: vnot.v v0, v16
+; RV32-NEXT: vsrl.vi v16, v0, 1
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a3
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v0, v8
-; RV32-NEXT: vsrl.vi v0, v0, 2
-; RV32-NEXT: vand.vv v0, v0, v8
-; RV32-NEXT: vadd.vv v16, v16, v0
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v24, v24
-; RV32-NEXT: vsrl.vi v0, v24, 1
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v0, v0, v16
-; RV32-NEXT: vsub.vv v24, v24, v0
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsub.vv v24, v0, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v16, 4
-; RV32-NEXT: vadd.vv v16, v16, v0
+; RV32-NEXT: vsrl.vi v0, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v24, v8
+; RV32-NEXT: vand.vv v0, v24, v16
; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v8, v24, v8
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: lui a2, 61681
; RV32-NEXT: lui a3, 4112
; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: addi a3, a3, 257
-; RV32-NEXT: vadd.vv v8, v0, v8
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v16, v0, v16
+; RV32-NEXT: vsrl.vi v24, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v24
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v16, v24
+; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vmul.vv v16, v16, v24
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v8, v16, a2
+; RV32-NEXT: vsrl.vx v8, v8, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v16, v24, a2
+; RV32-NEXT: vsrl.vx v16, v16, a2
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 16
; RV32-NEXT: addi sp, sp, 16
@@ -4660,95 +4644,95 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV64-NEXT: lui a4, 209715
; RV64-NEXT: lui a5, 61681
; RV64-NEXT: lui a6, 4112
-; RV64-NEXT: addi a7, a3, 1365
-; RV64-NEXT: addi a3, a4, 819
-; RV64-NEXT: addi a4, a5, -241
-; RV64-NEXT: addi a6, a6, 257
-; RV64-NEXT: slli a5, a7, 32
-; RV64-NEXT: add a7, a7, a5
-; RV64-NEXT: slli a5, a3, 32
-; RV64-NEXT: add a5, a3, a5
-; RV64-NEXT: slli a3, a4, 32
-; RV64-NEXT: add a3, a4, a3
-; RV64-NEXT: slli a4, a6, 32
-; RV64-NEXT: add a4, a6, a4
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a6, a0, a6
-; RV64-NEXT: li a0, 56
+; RV64-NEXT: sltiu a7, a0, 17
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: addi a3, a3, 1365
+; RV64-NEXT: addi a4, a4, 819
+; RV64-NEXT: addi a5, a5, -241
+; RV64-NEXT: addi t0, a6, 257
+; RV64-NEXT: neg a6, a7
+; RV64-NEXT: and a0, a6, a0
+; RV64-NEXT: slli a6, a3, 32
+; RV64-NEXT: add a7, a3, a6
+; RV64-NEXT: slli a6, a4, 32
+; RV64-NEXT: add a6, a4, a6
+; RV64-NEXT: slli a3, a5, 32
+; RV64-NEXT: add a3, a5, a3
+; RV64-NEXT: slli a4, t0, 32
+; RV64-NEXT: add a4, t0, a4
+; RV64-NEXT: li a5, 56
; RV64-NEXT: vor.vv v8, v8, v24
; RV64-NEXT: vsrl.vi v24, v8, 2
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsrl.vi v24, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 1
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v8, 8
+; RV64-NEXT: vsrl.vi v24, v8, 4
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 2
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v8, 16
+; RV64-NEXT: vsrl.vi v24, v8, 8
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 4
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vx v24, v8, a2
+; RV64-NEXT: vsrl.vi v24, v8, 16
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 8
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsrl.vx v24, v8, a2
+; RV64-NEXT: vor.vv v8, v8, v24
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 16
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vnot.v v8, v8
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vor.vv v16, v16, v24
+; RV64-NEXT: vsrl.vx v24, v16, a2
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v8, 1
; RV64-NEXT: vand.vx v24, v24, a7
; RV64-NEXT: vsub.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vx v24, v16, a2
-; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v24, v8, a5
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vnot.v v16, v16
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v8, v8, a5
+; RV64-NEXT: vand.vx v24, v8, a6
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vx v8, v8, a6
; RV64-NEXT: vadd.vv v8, v24, v8
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 1
; RV64-NEXT: vand.vx v24, v24, a7
; RV64-NEXT: vsub.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v24, v16, a5
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vand.vx v24, v16, a6
; RV64-NEXT: vsrl.vi v16, v16, 2
+; RV64-NEXT: vand.vx v16, v16, a6
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vand.vx v8, v8, a3
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v16, v16, a5
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vmul.vx v8, v8, a4
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vadd.vv v16, v24, v16
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vmul.vx v8, v8, a4
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsrl.vx v8, v8, a5
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vand.vx v16, v16, a3
; RV64-NEXT: vmul.vx v16, v16, a4
-; RV64-NEXT: vsrl.vx v16, v16, a0
+; RV64-NEXT: vsrl.vx v16, v16, a5
; RV64-NEXT: ret
%v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 true, <32 x i1> splat (i1 true), i32 %evl)
ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index f56438bf87e6a..61bc86333d95f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -1430,7 +1430,10 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v8, v24, v0.t
@@ -1455,24 +1458,21 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, sp, 16
; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: li a3, 40
@@ -1481,15 +1481,18 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v8, v24, v8, v0.t
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; RV32-NEXT: vand.vv v16, v8, v24, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
@@ -1504,15 +1507,12 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -1541,10 +1541,14 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV64-NEXT: lui a2, 209715
; RV64-NEXT: lui a3, 61681
; RV64-NEXT: lui a4, 4112
+; RV64-NEXT: sltiu a5, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a1, a1, 1365
; RV64-NEXT: addi a2, a2, 819
; RV64-NEXT: addi a3, a3, -241
; RV64-NEXT: addi a4, a4, 257
+; RV64-NEXT: neg a5, a5
+; RV64-NEXT: and a0, a5, a0
; RV64-NEXT: slli a5, a1, 32
; RV64-NEXT: add a1, a1, a5
; RV64-NEXT: slli a5, a2, 32
@@ -1553,10 +1557,6 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV64-NEXT: add a3, a3, a5
; RV64-NEXT: slli a5, a4, 32
; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: addi a5, a0, -16
-; RV64-NEXT: sltu a0, a0, a5
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a5
; RV64-NEXT: li a5, 56
; RV64-NEXT: vand.vx v24, v24, a1, v0.t
; RV64-NEXT: vsub.vv v8, v8, v24, v0.t
@@ -1603,10 +1603,10 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: addi a2, a2, 1365
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v0, a2
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -1628,13 +1628,13 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vand.vv v24, v16, v0
; RV32-NEXT: vsrl.vi v16, v16, 2
; RV32-NEXT: vand.vv v16, v16, v0
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: lui a2, 61681
; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v24, v16
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v24, v16, 4
; RV32-NEXT: vadd.vv v16, v16, v24
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
@@ -1672,10 +1672,14 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: lui a3, 209715
; RV64-NEXT: lui a4, 61681
; RV64-NEXT: lui a5, 4112
+; RV64-NEXT: sltiu a6, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
; RV64-NEXT: addi a4, a4, -241
; RV64-NEXT: addi a5, a5, 257
+; RV64-NEXT: neg a6, a6
+; RV64-NEXT: and a0, a6, a0
; RV64-NEXT: slli a6, a2, 32
; RV64-NEXT: add a2, a2, a6
; RV64-NEXT: slli a6, a3, 32
@@ -1684,10 +1688,6 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: add a4, a4, a6
; RV64-NEXT: slli a6, a5, 32
; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a6
; RV64-NEXT: li a6, 56
; RV64-NEXT: vand.vx v24, v24, a2
; RV64-NEXT: vsub.vv v8, v8, v24
@@ -1710,18 +1710,18 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: vadd.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vand.vx v8, v8, a4
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vadd.vv v16, v16, v24
+; RV64-NEXT: vand.vx v16, v16, a4
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vmul.vx v8, v8, a5
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v16, v16, a4
+; RV64-NEXT: vmul.vx v16, v16, a5
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vx v8, v8, a6
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vmul.vx v16, v16, a5
; RV64-NEXT: vsrl.vx v16, v16, a6
; RV64-NEXT: ret
%v = call <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64> %va, <32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index 098384d200045..0e3eadcce484e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -1604,10 +1604,10 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: li a3, 40
@@ -1616,26 +1616,26 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vi v24, v8, -1, v0.t
+; RV32-NEXT: vadd.vi v16, v8, -1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v24, v0.t
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
@@ -1679,31 +1679,31 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV64-NEXT: lui a2, 209715
; RV64-NEXT: lui a3, 61681
; RV64-NEXT: lui a4, 4112
+; RV64-NEXT: sltiu a5, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a1, a1, 1365
; RV64-NEXT: addi a2, a2, 819
-; RV64-NEXT: addi a5, a3, -241
+; RV64-NEXT: addi a3, a3, -241
; RV64-NEXT: addi a4, a4, 257
-; RV64-NEXT: slli a3, a1, 32
-; RV64-NEXT: add a6, a1, a3
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a3, a2, a3
-; RV64-NEXT: slli a1, a5, 32
-; RV64-NEXT: add a1, a5, a1
+; RV64-NEXT: neg a5, a5
+; RV64-NEXT: and a5, a5, a0
+; RV64-NEXT: slli a0, a1, 32
+; RV64-NEXT: add a6, a1, a0
+; RV64-NEXT: slli a0, a2, 32
+; RV64-NEXT: add a7, a2, a0
+; RV64-NEXT: slli a1, a3, 32
+; RV64-NEXT: add a1, a3, a1
; RV64-NEXT: slli a2, a4, 32
; RV64-NEXT: add a2, a4, a2
-; RV64-NEXT: addi a4, a0, -16
-; RV64-NEXT: sltu a0, a0, a4
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a4, a0, a4
; RV64-NEXT: li a0, 56
; RV64-NEXT: vnot.v v8, v8, v0.t
; RV64-NEXT: vand.vv v8, v8, v24, v0.t
; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t
; RV64-NEXT: vand.vx v24, v24, a6, v0.t
; RV64-NEXT: vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT: vand.vx v24, v8, a3, v0.t
+; RV64-NEXT: vand.vx v24, v8, a7, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
+; RV64-NEXT: vand.vx v8, v8, a7, v0.t
; RV64-NEXT: vadd.vv v8, v24, v8, v0.t
; RV64-NEXT: vsrl.vi v24, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v24, v0.t
@@ -1711,16 +1711,16 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV64-NEXT: vmul.vx v8, v8, a2, v0.t
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: vmv1r.v v0, v7
-; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a5, e64, m8, ta, ma
; RV64-NEXT: vadd.vi v24, v16, -1, v0.t
; RV64-NEXT: vnot.v v16, v16, v0.t
; RV64-NEXT: vand.vv v16, v16, v24, v0.t
; RV64-NEXT: vsrl.vi v24, v16, 1, v0.t
; RV64-NEXT: vand.vx v24, v24, a6, v0.t
; RV64-NEXT: vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT: vand.vx v24, v16, a3, v0.t
+; RV64-NEXT: vand.vx v24, v16, a7, v0.t
; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT: vand.vx v16, v16, a3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a7, v0.t
; RV64-NEXT: vadd.vv v16, v24, v16, v0.t
; RV64-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV64-NEXT: vadd.vv v16, v16, v24, v0.t
@@ -1744,9 +1744,9 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vadd.vi v24, v8, -1
; RV32-NEXT: vnot.v v0, v8
@@ -1754,15 +1754,10 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: addi a2, a2, 1365
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v0, v24
@@ -1774,8 +1769,10 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vadd.vi v0, v16, -1
; RV32-NEXT: vnot.v v16, v16
; RV32-NEXT: vand.vv v0, v16, v0
+; RV32-NEXT: vsrl.vi v16, v0, 1
+; RV32-NEXT: vand.vv v8, v16, v8
; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs8r.v v0, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -1783,16 +1780,9 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsrl.vi v24, v24, 2
; RV32-NEXT: vand.vv v24, v24, v16
; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v0, 1
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v0
; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v24, v0, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 4
@@ -1826,7 +1816,7 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v16, v16, a2
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 16
; RV32-NEXT: addi sp, sp, 16
@@ -1848,10 +1838,14 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: lui a3, 209715
; RV64-NEXT: lui a4, 61681
; RV64-NEXT: lui a5, 4112
+; RV64-NEXT: sltiu a6, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
; RV64-NEXT: addi a4, a4, -241
; RV64-NEXT: addi a5, a5, 257
+; RV64-NEXT: neg a6, a6
+; RV64-NEXT: and a0, a6, a0
; RV64-NEXT: slli a6, a2, 32
; RV64-NEXT: add a2, a2, a6
; RV64-NEXT: slli a6, a3, 32
@@ -1860,47 +1854,43 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: add a4, a4, a6
; RV64-NEXT: slli a6, a5, 32
; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a6
; RV64-NEXT: li a6, 56
; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: vsrl.vi v24, v8, 1
-; RV64-NEXT: vand.vx v24, v24, a2
-; RV64-NEXT: vsub.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vadd.vi v24, v16, -1
; RV64-NEXT: vnot.v v16, v16
; RV64-NEXT: vand.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v24, v8, a3
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a3
-; RV64-NEXT: vadd.vv v8, v24, v8
+; RV64-NEXT: vsrl.vi v24, v8, 1
+; RV64-NEXT: vand.vx v24, v24, a2
+; RV64-NEXT: vsub.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 1
; RV64-NEXT: vand.vx v24, v24, a2
; RV64-NEXT: vsub.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vand.vx v24, v8, a3
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vx v8, v8, a3
+; RV64-NEXT: vadd.vv v8, v24, v8
; RV64-NEXT: vsrl.vi v24, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vand.vx v24, v16, a3
; RV64-NEXT: vsrl.vi v16, v16, 2
; RV64-NEXT: vand.vx v16, v16, a3
+; RV64-NEXT: vadd.vv v16, v24, v16
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vand.vx v8, v8, a4
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vadd.vv v16, v24, v16
+; RV64-NEXT: vsrl.vi v24, v16, 4
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vmul.vx v8, v8, a5
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vx v8, v8, a6
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vand.vx v16, v16, a4
; RV64-NEXT: vmul.vx v16, v16, a5
; RV64-NEXT: vsrl.vx v16, v16, a6
@@ -3509,10 +3499,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: li a3, 40
@@ -3521,26 +3511,26 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vi v24, v8, -1, v0.t
+; RV32-NEXT: vadd.vi v16, v8, -1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v24, v0.t
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
@@ -3584,31 +3574,31 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV64-NEXT: lui a2, 209715
; RV64-NEXT: lui a3, 61681
; RV64-NEXT: lui a4, 4112
+; RV64-NEXT: sltiu a5, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a1, a1, 1365
; RV64-NEXT: addi a2, a2, 819
-; RV64-NEXT: addi a5, a3, -241
+; RV64-NEXT: addi a3, a3, -241
; RV64-NEXT: addi a4, a4, 257
-; RV64-NEXT: slli a3, a1, 32
-; RV64-NEXT: add a6, a1, a3
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a3, a2, a3
-; RV64-NEXT: slli a1, a5, 32
-; RV64-NEXT: add a1, a5, a1
+; RV64-NEXT: neg a5, a5
+; RV64-NEXT: and a5, a5, a0
+; RV64-NEXT: slli a0, a1, 32
+; RV64-NEXT: add a6, a1, a0
+; RV64-NEXT: slli a0, a2, 32
+; RV64-NEXT: add a7, a2, a0
+; RV64-NEXT: slli a1, a3, 32
+; RV64-NEXT: add a1, a3, a1
; RV64-NEXT: slli a2, a4, 32
; RV64-NEXT: add a2, a4, a2
-; RV64-NEXT: addi a4, a0, -16
-; RV64-NEXT: sltu a0, a0, a4
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a4, a0, a4
; RV64-NEXT: li a0, 56
; RV64-NEXT: vnot.v v8, v8, v0.t
; RV64-NEXT: vand.vv v8, v8, v24, v0.t
; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t
; RV64-NEXT: vand.vx v24, v24, a6, v0.t
; RV64-NEXT: vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT: vand.vx v24, v8, a3, v0.t
+; RV64-NEXT: vand.vx v24, v8, a7, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
+; RV64-NEXT: vand.vx v8, v8, a7, v0.t
; RV64-NEXT: vadd.vv v8, v24, v8, v0.t
; RV64-NEXT: vsrl.vi v24, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v24, v0.t
@@ -3616,16 +3606,16 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV64-NEXT: vmul.vx v8, v8, a2, v0.t
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: vmv1r.v v0, v7
-; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a5, e64, m8, ta, ma
; RV64-NEXT: vadd.vi v24, v16, -1, v0.t
; RV64-NEXT: vnot.v v16, v16, v0.t
; RV64-NEXT: vand.vv v16, v16, v24, v0.t
; RV64-NEXT: vsrl.vi v24, v16, 1, v0.t
; RV64-NEXT: vand.vx v24, v24, a6, v0.t
; RV64-NEXT: vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT: vand.vx v24, v16, a3, v0.t
+; RV64-NEXT: vand.vx v24, v16, a7, v0.t
; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT: vand.vx v16, v16, a3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a7, v0.t
; RV64-NEXT: vadd.vv v16, v24, v16, v0.t
; RV64-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV64-NEXT: vadd.vv v16, v16, v24, v0.t
@@ -3649,9 +3639,9 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vadd.vi v24, v8, -1
; RV32-NEXT: vnot.v v0, v8
@@ -3659,15 +3649,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: addi a2, a2, 1365
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v0, v24
@@ -3679,8 +3664,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: vadd.vi v0, v16, -1
; RV32-NEXT: vnot.v v16, v16
; RV32-NEXT: vand.vv v0, v16, v0
+; RV32-NEXT: vsrl.vi v16, v0, 1
+; RV32-NEXT: vand.vv v8, v16, v8
; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs8r.v v0, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -3688,16 +3675,9 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: vsrl.vi v24, v24, 2
; RV32-NEXT: vand.vv v24, v24, v16
; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v0, 1
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v0
; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v24, v0, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 4
@@ -3731,7 +3711,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v16, v16, a2
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 16
; RV32-NEXT: addi sp, sp, 16
@@ -3753,10 +3733,14 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV64-NEXT: lui a3, 209715
; RV64-NEXT: lui a4, 61681
; RV64-NEXT: lui a5, 4112
+; RV64-NEXT: sltiu a6, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
; RV64-NEXT: addi a4, a4, -241
; RV64-NEXT: addi a5, a5, 257
+; RV64-NEXT: neg a6, a6
+; RV64-NEXT: and a0, a6, a0
; RV64-NEXT: slli a6, a2, 32
; RV64-NEXT: add a2, a2, a6
; RV64-NEXT: slli a6, a3, 32
@@ -3765,47 +3749,43 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV64-NEXT: add a4, a4, a6
; RV64-NEXT: slli a6, a5, 32
; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a6
; RV64-NEXT: li a6, 56
; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: vsrl.vi v24, v8, 1
-; RV64-NEXT: vand.vx v24, v24, a2
-; RV64-NEXT: vsub.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vadd.vi v24, v16, -1
; RV64-NEXT: vnot.v v16, v16
; RV64-NEXT: vand.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v24, v8, a3
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a3
-; RV64-NEXT: vadd.vv v8, v24, v8
+; RV64-NEXT: vsrl.vi v24, v8, 1
+; RV64-NEXT: vand.vx v24, v24, a2
+; RV64-NEXT: vsub.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 1
; RV64-NEXT: vand.vx v24, v24, a2
; RV64-NEXT: vsub.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vand.vx v24, v8, a3
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vx v8, v8, a3
+; RV64-NEXT: vadd.vv v8, v24, v8
; RV64-NEXT: vsrl.vi v24, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vand.vx v24, v16, a3
; RV64-NEXT: vsrl.vi v16, v16, 2
; RV64-NEXT: vand.vx v16, v16, a3
+; RV64-NEXT: vadd.vv v16, v24, v16
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vand.vx v8, v8, a4
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vadd.vv v16, v24, v16
+; RV64-NEXT: vsrl.vi v24, v16, 4
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vmul.vx v8, v8, a5
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vx v8, v8, a6
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vand.vx v16, v16, a4
; RV64-NEXT: vmul.vx v16, v16, a5
; RV64-NEXT: vsrl.vx v16, v16, a6
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
index 76f5f0a32bd1c..5a0749068b41d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT: addi a1, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a1
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a1
+; RV32ZVFH-NEXT: sltiu a1, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a1, a1
+; RV32ZVFH-NEXT: and a0, a1, a0
; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFH-NEXT: fsrmi a1, 2
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV64ZVFH-NEXT: li a1, 1075
; RV64ZVFH-NEXT: slli a1, a1, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a1
-; RV64ZVFH-NEXT: addi a1, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a1
-; RV64ZVFH-NEXT: addi a0, a0, -1
+; RV64ZVFH-NEXT: sltiu a1, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a1, a1
+; RV64ZVFH-NEXT: and a0, a1, a0
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT: and a0, a0, a1
; RV64ZVFH-NEXT: fsrmi a1, 2
; RV64ZVFH-NEXT: vmv1r.v v0, v6
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT: addi a1, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a1
+; RV32ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a1, a1
+; RV32ZVFHMIN-NEXT: and a0, a1, a0
; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFHMIN-NEXT: fsrmi a1, 2
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV64ZVFHMIN-NEXT: li a1, 1075
; RV64ZVFHMIN-NEXT: slli a1, a1, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT: addi a1, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
+; RV64ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a1, a1
+; RV64ZVFHMIN-NEXT: and a0, a1, a0
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT: and a0, a0, a1
; RV64ZVFHMIN-NEXT: fsrmi a1, 2
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV32ZVFH-NEXT: vfabs.v v24, v8
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT: addi a2, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a2
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a2
+; RV32ZVFH-NEXT: sltiu a2, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a2, a2
+; RV32ZVFH-NEXT: and a0, a2, a0
; RV32ZVFH-NEXT: fsrmi a2, 2
; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV64ZVFH-NEXT: li a2, 1075
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: addi a2, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a2
-; RV64ZVFH-NEXT: addi a0, a0, -1
-; RV64ZVFH-NEXT: and a0, a0, a2
+; RV64ZVFH-NEXT: sltiu a2, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a2, a2
+; RV64ZVFH-NEXT: and a0, a2, a0
; RV64ZVFH-NEXT: fsrmi a2, 2
; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT: addi a2, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a2
+; RV32ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a2, a2
+; RV32ZVFHMIN-NEXT: and a0, a2, a0
; RV32ZVFHMIN-NEXT: fsrmi a2, 2
; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV64ZVFHMIN-NEXT: li a2, 1075
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: addi a2, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
-; RV64ZVFHMIN-NEXT: and a0, a0, a2
+; RV64ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a2, a2
+; RV64ZVFHMIN-NEXT: and a0, a2, a0
; RV64ZVFHMIN-NEXT: fsrmi a2, 2
; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
index da6e2fae93687..ad7ee735707f7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
@@ -611,10 +611,10 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: li a2, 24
@@ -657,75 +657,6 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
}
define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %vb, i32 zeroext %evl) {
-; CHECK-LABEL: vfmax_vv_v32f64_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a1, a0, 128
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vle64.v v24, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: bltu a2, a1, .LBB25_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
-; CHECK-NEXT: .LBB25_2:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v16
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v8, v8
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
-; CHECK-NEXT: vfmax.vv v16, v16, v24
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%v = call <32 x double> @llvm.vp.maximum.v32f64(<32 x double> %va, <32 x double> %vb, <32 x i1> splat (i1 true), i32 %evl)
ret <32 x double> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
index e179970199171..9a5304e0d94e6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
@@ -611,10 +611,10 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: li a2, 24
@@ -657,75 +657,6 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
}
define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %vb, i32 zeroext %evl) {
-; CHECK-LABEL: vfmin_vv_v32f64_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a1, a0, 128
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vle64.v v24, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: bltu a2, a1, .LBB25_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
-; CHECK-NEXT: .LBB25_2:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v16
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v8, v8
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
-; CHECK-NEXT: vfmin.vv v16, v16, v24
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%v = call <32 x double> @llvm.vp.minimum.v32f64(<32 x double> %va, <32 x double> %vb, <32 x i1> splat (i1 true), i32 %evl)
ret <32 x double> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
index 465b166826a37..6d87ecfd3bc6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
@@ -96,10 +96,10 @@ define <32 x double> @vfpext_v32f32_v32f64(<32 x float> %a, <32 x i1> %m, i32 ze
; CHECK-NEXT: .LBB7_2:
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
; CHECK-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 16
; CHECK-NEXT: vmv1r.v v0, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
index 96eda109e1c70..044b9fefa1220 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
@@ -376,10 +376,10 @@ define <32 x i64> @vfptosi_v32i64_v32f64(<32 x double> %va, <32 x i1> %m, i32 ze
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v16, v0.t
@@ -399,10 +399,10 @@ define <32 x i64> @vfptosi_v32i64_v32f64_unmasked(<32 x double> %va, i32 zeroext
; CHECK-NEXT: .LBB26_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
index 4020100bf364b..55f4d9e0805c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
@@ -376,10 +376,10 @@ define <32 x i64> @vfptoui_v32i64_v32f64(<32 x double> %va, <32 x i1> %m, i32 ze
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.xu.f.v v16, v16, v0.t
@@ -399,10 +399,10 @@ define <32 x i64> @vfptoui_v32i64_v32f64_unmasked(<32 x double> %va, i32 zeroext
; CHECK-NEXT: .LBB26_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.xu.f.v v16, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
index e509722b623a2..aab5bbdfebacd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
@@ -97,10 +97,10 @@ define <32 x float> @vfptrunc_v32f32_v32f64(<32 x double> %a, <32 x i1> %m, i32
; CHECK-NEXT: .LBB7_2:
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
; CHECK-NEXT: vfncvt.f.f.w v8, v24, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v12
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT: vfncvt.f.f.w v24, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
index 57c94830fc606..e3ed908a5bddb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -741,10 +741,10 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vfabs.v v24, v8, v0.t
; RV32-NEXT: lui a1, %hi(.LCPI26_0)
; RV32-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32-NEXT: addi a1, a0, -16
-; RV32-NEXT: sltu a0, a0, a1
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: sltiu a1, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a1, a1
+; RV32-NEXT: and a0, a1, a0
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32-NEXT: frflags a1
@@ -787,12 +787,12 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; RV64-NEXT: li a1, 1075
; RV64-NEXT: slli a1, a1, 52
; RV64-NEXT: fmv.d.x fa5, a1
-; RV64-NEXT: addi a1, a0, -16
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: sltiu a1, a0, 17
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: neg a1, a1
+; RV64-NEXT: and a0, a1, a0
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64-NEXT: and a0, a0, a1
; RV64-NEXT: frflags a1
; RV64-NEXT: vmv1r.v v0, v6
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -832,10 +832,10 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex
; RV32-NEXT: vfabs.v v24, v8
; RV32-NEXT: lui a2, %hi(.LCPI27_0)
; RV32-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: frflags a2
; RV32-NEXT: vmflt.vf v0, v24, fa5
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -870,10 +870,10 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex
; RV64-NEXT: li a2, 1075
; RV64-NEXT: slli a2, a2, 52
; RV64-NEXT: fmv.d.x fa5, a2
-; RV64-NEXT: addi a2, a0, -16
-; RV64-NEXT: sltu a0, a0, a2
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: sltiu a2, a0, 17
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: neg a2, a2
+; RV64-NEXT: and a0, a2, a0
; RV64-NEXT: frflags a2
; RV64-NEXT: vmflt.vf v0, v24, fa5
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
index ca9b24e60e503..4e90727b6ebf1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
@@ -123,10 +123,10 @@ define float @vpreduce_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32
; CHECK-NEXT: vfmv.s.f v25, fa0
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vfredusum.vs v25, v8, v25, v0.t
-; CHECK-NEXT: addi a1, a0, -32
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 33
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -32
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vfredusum.vs v25, v16, v25, v0.t
@@ -151,10 +151,10 @@ define float @vpreduce_ord_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m,
; CHECK-NEXT: vfmv.s.f v25, fa0
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
-; CHECK-NEXT: addi a1, a0, -32
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 33
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -32
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vfredosum.vs v25, v16, v25, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
index 3e77020ed0213..27211f153b526 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
@@ -654,12 +654,12 @@ define signext i32 @vpreduce_xor_v64i32(i32 signext %s, <64 x i32> %v, <64 x i1>
; CHECK-NEXT: .LBB49_2:
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vmv.s.x v25, a0
-; CHECK-NEXT: addi a0, a1, -32
+; CHECK-NEXT: sltiu a0, a1, 33
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
-; CHECK-NEXT: sltu a1, a1, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a1, a1, -32
+; CHECK-NEXT: and a0, a0, a1
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vredxor.vs v25, v16, v25, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
index 8523ca957a8f5..b5cd2e783ff66 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
@@ -211,15 +211,15 @@ define zeroext i1 @vpreduce_and_v256i1(i1 zeroext %s, <256 x i1> %v, <256 x i1>
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: .LBB14_2:
; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: sltiu a3, a1, 129
+; CHECK-NEXT: addi a1, a1, -128
+; CHECK-NEXT: neg a3, a3
+; CHECK-NEXT: and a1, a3, a1
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vmnot.m v9, v9
; CHECK-NEXT: vcpop.m a2, v9, v0.t
; CHECK-NEXT: seqz a2, a2
; CHECK-NEXT: and a0, a2, a0
-; CHECK-NEXT: addi a2, a1, -128
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmnot.m v8, v8
; CHECK-NEXT: vmv1r.v v0, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
index 7540495c0d3b5..41e8d1f982e32 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
@@ -669,12 +669,12 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
; RV32-NEXT: vfabs.v v24, v8, v0.t
; RV32-NEXT: lui a1, %hi(.LCPI26_0)
; RV32-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32-NEXT: addi a1, a0, -16
-; RV32-NEXT: sltu a0, a0, a1
-; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: sltiu a1, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a1, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: and a0, a1, a0
; RV32-NEXT: vmv1r.v v0, v6
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vfcvt.x.f.v v24, v8, v0.t
@@ -711,12 +711,12 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
; RV64-NEXT: li a1, 1075
; RV64-NEXT: slli a1, a1, 52
; RV64-NEXT: fmv.d.x fa5, a1
-; RV64-NEXT: addi a1, a0, -16
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: sltiu a1, a0, 17
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: neg a1, a1
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: and a0, a1, a0
; RV64-NEXT: vmv1r.v v0, v6
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT: vfcvt.x.f.v v24, v8, v0.t
@@ -752,10 +752,10 @@ define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
; RV32-NEXT: vfabs.v v24, v8
; RV32-NEXT: lui a2, %hi(.LCPI27_0)
; RV32-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: vmflt.vf v0, v24, fa5
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vfabs.v v24, v16
@@ -786,11 +786,11 @@ define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
; RV64-NEXT: li a2, 1075
; RV64-NEXT: slli a2, a2, 52
; RV64-NEXT: fmv.d.x fa5, a2
-; RV64-NEXT: addi a2, a0, -16
-; RV64-NEXT: sltu a0, a0, a2
-; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: sltiu a2, a0, 17
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: neg a2, a2
+; RV64-NEXT: and a0, a2, a0
; RV64-NEXT: vmflt.vf v0, v24, fa5
-; RV64-NEXT: and a0, a0, a2
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vfabs.v v24, v16
; RV64-NEXT: vmflt.vf v7, v24, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
index de5427f329496..2d4941744292e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT: addi a1, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a1
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a1
+; RV32ZVFH-NEXT: sltiu a1, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a1, a1
+; RV32ZVFH-NEXT: and a0, a1, a0
; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFH-NEXT: fsrmi a1, 4
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV64ZVFH-NEXT: li a1, 1075
; RV64ZVFH-NEXT: slli a1, a1, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a1
-; RV64ZVFH-NEXT: addi a1, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a1
-; RV64ZVFH-NEXT: addi a0, a0, -1
+; RV64ZVFH-NEXT: sltiu a1, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a1, a1
+; RV64ZVFH-NEXT: and a0, a1, a0
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT: and a0, a0, a1
; RV64ZVFH-NEXT: fsrmi a1, 4
; RV64ZVFH-NEXT: vmv1r.v v0, v6
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT: addi a1, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a1
+; RV32ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a1, a1
+; RV32ZVFHMIN-NEXT: and a0, a1, a0
; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFHMIN-NEXT: fsrmi a1, 4
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV64ZVFHMIN-NEXT: li a1, 1075
; RV64ZVFHMIN-NEXT: slli a1, a1, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT: addi a1, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
+; RV64ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a1, a1
+; RV64ZVFHMIN-NEXT: and a0, a1, a0
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT: and a0, a0, a1
; RV64ZVFHMIN-NEXT: fsrmi a1, 4
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV32ZVFH-NEXT: vfabs.v v24, v8
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT: addi a2, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a2
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a2
+; RV32ZVFH-NEXT: sltiu a2, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a2, a2
+; RV32ZVFH-NEXT: and a0, a2, a0
; RV32ZVFH-NEXT: fsrmi a2, 4
; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV64ZVFH-NEXT: li a2, 1075
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: addi a2, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a2
-; RV64ZVFH-NEXT: addi a0, a0, -1
-; RV64ZVFH-NEXT: and a0, a0, a2
+; RV64ZVFH-NEXT: sltiu a2, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a2, a2
+; RV64ZVFH-NEXT: and a0, a2, a0
; RV64ZVFH-NEXT: fsrmi a2, 4
; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT: addi a2, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a2
+; RV32ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a2, a2
+; RV32ZVFHMIN-NEXT: and a0, a2, a0
; RV32ZVFHMIN-NEXT: fsrmi a2, 4
; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV64ZVFHMIN-NEXT: li a2, 1075
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: addi a2, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
-; RV64ZVFHMIN-NEXT: and a0, a0, a2
+; RV64ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a2, a2
+; RV64ZVFHMIN-NEXT: and a0, a2, a0
; RV64ZVFHMIN-NEXT: fsrmi a2, 4
; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
index 1c923e3f12171..45ea933f427ae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT: addi a1, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a1
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a1
+; RV32ZVFH-NEXT: sltiu a1, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a1, a1
+; RV32ZVFH-NEXT: and a0, a1, a0
; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFH-NEXT: fsrmi a1, 0
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; RV64ZVFH-NEXT: li a1, 1075
; RV64ZVFH-NEXT: slli a1, a1, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a1
-; RV64ZVFH-NEXT: addi a1, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a1
-; RV64ZVFH-NEXT: addi a0, a0, -1
+; RV64ZVFH-NEXT: sltiu a1, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a1, a1
+; RV64ZVFH-NEXT: and a0, a1, a0
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT: and a0, a0, a1
; RV64ZVFH-NEXT: fsrmi a1, 0
; RV64ZVFH-NEXT: vmv1r.v v0, v6
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT: addi a1, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a1
+; RV32ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a1, a1
+; RV32ZVFHMIN-NEXT: and a0, a1, a0
; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFHMIN-NEXT: fsrmi a1, 0
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; RV64ZVFHMIN-NEXT: li a1, 1075
; RV64ZVFHMIN-NEXT: slli a1, a1, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT: addi a1, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
+; RV64ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a1, a1
+; RV64ZVFHMIN-NEXT: and a0, a1, a0
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT: and a0, a0, a1
; RV64ZVFHMIN-NEXT: fsrmi a1, 0
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
; RV32ZVFH-NEXT: vfabs.v v24, v8
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT: addi a2, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a2
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a2
+; RV32ZVFH-NEXT: sltiu a2, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a2, a2
+; RV32ZVFH-NEXT: and a0, a2, a0
; RV32ZVFH-NEXT: fsrmi a2, 0
; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
; RV64ZVFH-NEXT: li a2, 1075
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: addi a2, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a2
-; RV64ZVFH-NEXT: addi a0, a0, -1
-; RV64ZVFH-NEXT: and a0, a0, a2
+; RV64ZVFH-NEXT: sltiu a2, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a2, a2
+; RV64ZVFH-NEXT: and a0, a2, a0
; RV64ZVFH-NEXT: fsrmi a2, 0
; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT: addi a2, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a2
+; RV32ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a2, a2
+; RV32ZVFHMIN-NEXT: and a0, a2, a0
; RV32ZVFHMIN-NEXT: fsrmi a2, 0
; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
; RV64ZVFHMIN-NEXT: li a2, 1075
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: addi a2, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
-; RV64ZVFHMIN-NEXT: and a0, a0, a2
+; RV64ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a2, a2
+; RV64ZVFHMIN-NEXT: and a0, a2, a0
; RV64ZVFHMIN-NEXT: fsrmi a2, 0
; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
index 83cbd2b760341..3dc45f97e6964 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT: addi a1, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a1
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a1
+; RV32ZVFH-NEXT: sltiu a1, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a1, a1
+; RV32ZVFH-NEXT: and a0, a1, a0
; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFH-NEXT: fsrmi a1, 1
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
; RV64ZVFH-NEXT: li a1, 1075
; RV64ZVFH-NEXT: slli a1, a1, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a1
-; RV64ZVFH-NEXT: addi a1, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a1
-; RV64ZVFH-NEXT: addi a0, a0, -1
+; RV64ZVFH-NEXT: sltiu a1, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a1, a1
+; RV64ZVFH-NEXT: and a0, a1, a0
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT: and a0, a0, a1
; RV64ZVFH-NEXT: fsrmi a1, 1
; RV64ZVFH-NEXT: vmv1r.v v0, v6
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT: addi a1, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a1
+; RV32ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a1, a1
+; RV32ZVFHMIN-NEXT: and a0, a1, a0
; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFHMIN-NEXT: fsrmi a1, 1
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
; RV64ZVFHMIN-NEXT: li a1, 1075
; RV64ZVFHMIN-NEXT: slli a1, a1, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT: addi a1, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
+; RV64ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a1, a1
+; RV64ZVFHMIN-NEXT: and a0, a1, a0
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT: and a0, a0, a1
; RV64ZVFHMIN-NEXT: fsrmi a1, 1
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
; RV32ZVFH-NEXT: vfabs.v v24, v8
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT: addi a2, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a2
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a2
+; RV32ZVFH-NEXT: sltiu a2, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a2, a2
+; RV32ZVFH-NEXT: and a0, a2, a0
; RV32ZVFH-NEXT: fsrmi a2, 1
; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
; RV64ZVFH-NEXT: li a2, 1075
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: addi a2, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a2
-; RV64ZVFH-NEXT: addi a0, a0, -1
-; RV64ZVFH-NEXT: and a0, a0, a2
+; RV64ZVFH-NEXT: sltiu a2, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a2, a2
+; RV64ZVFH-NEXT: and a0, a2, a0
; RV64ZVFH-NEXT: fsrmi a2, 1
; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT: addi a2, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a2
+; RV32ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a2, a2
+; RV32ZVFHMIN-NEXT: and a0, a2, a0
; RV32ZVFHMIN-NEXT: fsrmi a2, 1
; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
; RV64ZVFHMIN-NEXT: li a2, 1075
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: addi a2, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
-; RV64ZVFHMIN-NEXT: and a0, a0, a2
+; RV64ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a2, a2
+; RV64ZVFHMIN-NEXT: and a0, a2, a0
; RV64ZVFHMIN-NEXT: fsrmi a2, 1
; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index af3e9db9fe123..79f1b88a765b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1076,10 +1076,10 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFH-NEXT: .LBB43_2:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v6, v8, v24, v0.t
-; ZVFH-NEXT: addi a0, a2, -64
-; ZVFH-NEXT: sltu a1, a2, a0
-; ZVFH-NEXT: addi a1, a1, -1
-; ZVFH-NEXT: and a0, a1, a0
+; ZVFH-NEXT: sltiu a0, a2, 65
+; ZVFH-NEXT: neg a0, a0
+; ZVFH-NEXT: addi a1, a2, -64
+; ZVFH-NEXT: and a0, a0, a1
; ZVFH-NEXT: vmv1r.v v0, v7
; ZVFH-NEXT: addi a1, sp, 16
; ZVFH-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -3929,10 +3929,10 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x
; CHECK-NEXT: .LBB87_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v6, v8, v24, v0.t
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
index efc0f7ef4a441..9f354d160d7c1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
@@ -598,13 +598,13 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1>
; CHECK-NEXT: addi a4, a0, 128
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a2)
-; CHECK-NEXT: addi a2, a3, -128
+; CHECK-NEXT: sltiu a2, a3, 129
; CHECK-NEXT: vle8.v v24, (a4)
-; CHECK-NEXT: sltu a4, a3, a2
+; CHECK-NEXT: addi a4, a3, -128
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a2, a4, a2
-; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: neg a0, a2
+; CHECK-NEXT: and a0, a0, a4
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vmseq.vv v6, v16, v24, v0.t
; CHECK-NEXT: bltu a3, a1, .LBB51_2
; CHECK-NEXT: # %bb.1:
@@ -636,10 +636,10 @@ define <256 x i1> @icmp_eq_vx_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 z
; CHECK-NEXT: li a3, 128
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a1)
-; CHECK-NEXT: addi a1, a2, -128
-; CHECK-NEXT: sltu a4, a2, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: sltiu a1, a2, 129
+; CHECK-NEXT: addi a4, a2, -128
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a1, a1, a4
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t
; CHECK-NEXT: bltu a2, a3, .LBB52_2
@@ -666,10 +666,10 @@ define <256 x i1> @icmp_eq_vx_swap_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m,
; CHECK-NEXT: li a3, 128
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a1)
-; CHECK-NEXT: addi a1, a2, -128
-; CHECK-NEXT: sltu a4, a2, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: sltiu a1, a2, 129
+; CHECK-NEXT: addi a4, a2, -128
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a1, a1, a4
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t
; CHECK-NEXT: bltu a2, a3, .LBB53_2
@@ -1250,10 +1250,10 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m
; CHECK-NEXT: .LBB99_2:
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vmseq.vv v6, v8, v24, v0.t
-; CHECK-NEXT: addi a0, a2, -32
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 33
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -32
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -1286,10 +1286,10 @@ define <64 x i1> @icmp_eq_vx_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 ze
; CHECK-NEXT: .LBB100_2:
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: vmseq.vx v25, v8, a0, v0.t
-; CHECK-NEXT: addi a2, a1, -32
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: sltiu a2, a1, 33
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: addi a1, a1, -32
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vmseq.vx v8, v16, a0, v0.t
@@ -1316,10 +1316,10 @@ define <64 x i1> @icmp_eq_vx_swap_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i
; CHECK-NEXT: .LBB101_2:
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: vmseq.vx v25, v8, a0, v0.t
-; CHECK-NEXT: addi a2, a1, -32
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: sltiu a2, a1, 33
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: addi a1, a1, -32
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vmseq.vx v8, v16, a0, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
index a452e5a9ffbb8..9a08596ebb473 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
@@ -147,10 +147,10 @@ define <32 x i64> @vsext_v32i64_v32i32(<32 x i32> %va, <32 x i1> %m, i32 zeroext
; CHECK-NEXT: .LBB12_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vsext.vf2 v16, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 16
; CHECK-NEXT: vmv1r.v v0, v24
@@ -174,10 +174,10 @@ define <32 x i64> @vsext_v32i64_v32i32_unmasked(<32 x i32> %va, i32 zeroext %evl
; CHECK-NEXT: .LBB13_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vsext.vf2 v24, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 16
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
index afa8f2fda2ed4..8202ba4e2d815 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
@@ -372,10 +372,10 @@ define <32 x double> @vsitofp_v32f64_v32i64(<32 x i64> %va, <32 x i1> %m, i32 ze
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
@@ -395,10 +395,10 @@ define <32 x double> @vsitofp_v32f64_v32i64_unmasked(<32 x i64> %va, i32 zeroext
; CHECK-NEXT: .LBB26_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.x.v v8, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.x.v v16, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 8af4ced77be39..45c106240efc9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -487,25 +487,24 @@ define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: li a4, 16
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: bltu a2, a4, .LBB45_2
+; CHECK-NEXT: sltiu a3, a2, 17
+; CHECK-NEXT: addi a4, a2, -16
+; CHECK-NEXT: neg a3, a3
+; CHECK-NEXT: li a5, 16
+; CHECK-NEXT: and a3, a3, a4
+; CHECK-NEXT: bltu a2, a5, .LBB45_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 16
+; CHECK-NEXT: li a2, 16
; CHECK-NEXT: .LBB45_2:
-; CHECK-NEXT: mul a4, a3, a1
-; CHECK-NEXT: addi a5, a2, -16
+; CHECK-NEXT: mul a4, a2, a1
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v9, 2
; CHECK-NEXT: add a4, a0, a4
-; CHECK-NEXT: sltu a2, a2, a5
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a5
; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vlse64.v v16, (a4), a1, v0.t
; CHECK-NEXT: vmv1r.v v0, v9
-; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
%load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr %ptr, i32 %stride, <32 x i1> %m, i32 %evl)
@@ -515,21 +514,20 @@ define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x
define <32 x double> @strided_vpload_v32f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) nounwind {
; CHECK-LABEL: strided_vpload_v32f64_allones_mask:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a4, 16
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: bltu a2, a4, .LBB46_2
+; CHECK-NEXT: sltiu a3, a2, 17
+; CHECK-NEXT: addi a4, a2, -16
+; CHECK-NEXT: neg a3, a3
+; CHECK-NEXT: li a5, 16
+; CHECK-NEXT: and a3, a3, a4
+; CHECK-NEXT: bltu a2, a5, .LBB46_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 16
+; CHECK-NEXT: li a2, 16
; CHECK-NEXT: .LBB46_2:
-; CHECK-NEXT: mul a4, a3, a1
-; CHECK-NEXT: addi a5, a2, -16
+; CHECK-NEXT: mul a4, a2, a1
; CHECK-NEXT: add a4, a0, a4
-; CHECK-NEXT: sltu a2, a2, a5
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a5
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vlse64.v v16, (a4), a1
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT: vlse64.v v16, (a4), a1
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1
; CHECK-NEXT: ret
%load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr %ptr, i32 %stride, <32 x i1> splat (i1 true), i32 %evl)
@@ -549,10 +547,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV32-NEXT: li a3, 32
; CHECK-RV32-NEXT: .LBB47_2:
; CHECK-RV32-NEXT: mul a6, a3, a2
-; CHECK-RV32-NEXT: addi a5, a4, -32
-; CHECK-RV32-NEXT: sltu a7, a4, a5
-; CHECK-RV32-NEXT: addi a7, a7, -1
-; CHECK-RV32-NEXT: and a7, a7, a5
+; CHECK-RV32-NEXT: sltiu a5, a4, 33
+; CHECK-RV32-NEXT: addi a7, a4, -32
+; CHECK-RV32-NEXT: neg a5, a5
+; CHECK-RV32-NEXT: and a7, a5, a7
; CHECK-RV32-NEXT: li a5, 16
; CHECK-RV32-NEXT: add a6, a1, a6
; CHECK-RV32-NEXT: bltu a7, a5, .LBB47_4
@@ -563,10 +561,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV32-NEXT: vslidedown.vi v0, v8, 4
; CHECK-RV32-NEXT: vsetvli zero, a7, e64, m8, ta, ma
; CHECK-RV32-NEXT: vlse64.v v16, (a6), a2, v0.t
-; CHECK-RV32-NEXT: addi a6, a3, -16
-; CHECK-RV32-NEXT: sltu a3, a3, a6
-; CHECK-RV32-NEXT: addi a3, a3, -1
-; CHECK-RV32-NEXT: and a3, a3, a6
+; CHECK-RV32-NEXT: sltiu a6, a3, 17
+; CHECK-RV32-NEXT: neg a6, a6
+; CHECK-RV32-NEXT: addi a3, a3, -16
+; CHECK-RV32-NEXT: and a3, a6, a3
; CHECK-RV32-NEXT: bltu a4, a5, .LBB47_6
; CHECK-RV32-NEXT: # %bb.5:
; CHECK-RV32-NEXT: li a4, 16
@@ -600,10 +598,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV64-NEXT: li a4, 32
; CHECK-RV64-NEXT: .LBB47_2:
; CHECK-RV64-NEXT: mul a6, a4, a2
-; CHECK-RV64-NEXT: addi a5, a3, -32
-; CHECK-RV64-NEXT: sltu a7, a3, a5
-; CHECK-RV64-NEXT: addi a7, a7, -1
-; CHECK-RV64-NEXT: and a7, a7, a5
+; CHECK-RV64-NEXT: sltiu a5, a3, 33
+; CHECK-RV64-NEXT: addi a7, a3, -32
+; CHECK-RV64-NEXT: neg a5, a5
+; CHECK-RV64-NEXT: and a7, a5, a7
; CHECK-RV64-NEXT: li a5, 16
; CHECK-RV64-NEXT: add a6, a1, a6
; CHECK-RV64-NEXT: bltu a7, a5, .LBB47_4
@@ -614,10 +612,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV64-NEXT: vslidedown.vi v0, v8, 4
; CHECK-RV64-NEXT: vsetvli zero, a7, e64, m8, ta, ma
; CHECK-RV64-NEXT: vlse64.v v16, (a6), a2, v0.t
-; CHECK-RV64-NEXT: addi a6, a4, -16
-; CHECK-RV64-NEXT: sltu a4, a4, a6
-; CHECK-RV64-NEXT: addi a4, a4, -1
-; CHECK-RV64-NEXT: and a4, a4, a6
+; CHECK-RV64-NEXT: sltiu a6, a4, 17
+; CHECK-RV64-NEXT: neg a6, a6
+; CHECK-RV64-NEXT: addi a4, a4, -16
+; CHECK-RV64-NEXT: and a4, a6, a4
; CHECK-RV64-NEXT: bltu a3, a5, .LBB47_6
; CHECK-RV64-NEXT: # %bb.5:
; CHECK-RV64-NEXT: li a3, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
index 25624ea0fcf6c..c7edae931a126 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
@@ -411,14 +411,14 @@ define void @strided_store_v32f64(<32 x double> %v, ptr %ptr, i32 signext %strid
; CHECK-NEXT: .LBB38_2:
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t
+; CHECK-NEXT: sltiu a4, a2, 17
+; CHECK-NEXT: addi a2, a2, -16
; CHECK-NEXT: mul a3, a3, a1
-; CHECK-NEXT: add a0, a0, a3
-; CHECK-NEXT: addi a3, a2, -16
-; CHECK-NEXT: sltu a2, a2, a3
-; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 2
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: neg a4, a4
+; CHECK-NEXT: and a2, a4, a2
+; CHECK-NEXT: add a0, a0, a3
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v16, (a0), a1, v0.t
; CHECK-NEXT: ret
@@ -437,12 +437,12 @@ define void @strided_store_v32f64_allones_mask(<32 x double> %v, ptr %ptr, i32 s
; CHECK-NEXT: .LBB39_2:
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v8, (a0), a1
+; CHECK-NEXT: sltiu a4, a2, 17
+; CHECK-NEXT: addi a2, a2, -16
; CHECK-NEXT: mul a3, a3, a1
+; CHECK-NEXT: neg a4, a4
+; CHECK-NEXT: and a2, a4, a2
; CHECK-NEXT: add a0, a0, a3
-; CHECK-NEXT: addi a3, a2, -16
-; CHECK-NEXT: sltu a2, a2, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v16, (a0), a1
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
index f992d1f8f7eee..f69a4ffde7910 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
@@ -56,10 +56,10 @@ define <128 x i7> @vtrunc_v128i7_v128i16(<128 x i16> %a, <128 x i1> %m, i32 zero
; CHECK-NEXT: .LBB4_2:
; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v24, 0, v0.t
-; CHECK-NEXT: addi a2, a0, -64
-; CHECK-NEXT: sltu a0, a0, a2
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: sltiu a2, a0, 65
+; CHECK-NEXT: addi a0, a0, -64
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: and a0, a2, a0
; CHECK-NEXT: vmv1r.v v0, v12
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
; CHECK-NEXT: vnsrl.wi v24, v16, 0, v0.t
@@ -214,79 +214,85 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; RV32-NEXT: vmv1r.v v7, v0
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
; RV32-NEXT: vslidedown.vi v5, v0, 8
; RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vi v4, v0, 4
-; RV32-NEXT: addi a2, a7, -64
-; RV32-NEXT: vslidedown.vi v3, v5, 4
-; RV32-NEXT: sltu a3, a7, a2
-; RV32-NEXT: addi a3, a3, -1
-; RV32-NEXT: and a4, a3, a2
-; RV32-NEXT: addi a2, a4, -32
-; RV32-NEXT: sltu a3, a4, a2
-; RV32-NEXT: addi a3, a3, -1
-; RV32-NEXT: and a3, a3, a2
+; RV32-NEXT: sltiu a2, a7, 65
+; RV32-NEXT: addi a3, a7, -64
+; RV32-NEXT: neg a4, a2
+; RV32-NEXT: and a4, a4, a3
+; RV32-NEXT: sltiu a2, a4, 33
+; RV32-NEXT: addi a3, a4, -32
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and t1, a2, a3
; RV32-NEXT: li a2, 16
-; RV32-NEXT: addi t0, a3, -16
-; RV32-NEXT: mv a5, a3
-; RV32-NEXT: bltu a3, a2, .LBB16_2
+; RV32-NEXT: vslidedown.vi v3, v5, 4
+; RV32-NEXT: mv a5, t1
+; RV32-NEXT: bltu t1, a2, .LBB16_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a5, 16
; RV32-NEXT: .LBB16_2:
-; RV32-NEXT: li t2, 64
-; RV32-NEXT: sltu t1, a3, t0
+; RV32-NEXT: li t0, 64
+; RV32-NEXT: sltiu a3, t1, 17
; RV32-NEXT: mv a6, a7
-; RV32-NEXT: bltu a7, t2, .LBB16_4
+; RV32-NEXT: bltu a7, t0, .LBB16_4
; RV32-NEXT: # %bb.3:
; RV32-NEXT: li a6, 64
; RV32-NEXT: .LBB16_4:
; RV32-NEXT: addi t2, a1, 128
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v6, v4, 2
-; RV32-NEXT: addi t6, a1, 512
-; RV32-NEXT: addi t5, a1, 640
+; RV32-NEXT: addi t5, a1, 512
+; RV32-NEXT: addi t4, a1, 640
; RV32-NEXT: vslidedown.vi v0, v3, 2
-; RV32-NEXT: addi t1, t1, -1
+; RV32-NEXT: neg t0, a3
+; RV32-NEXT: addi t1, t1, -16
; RV32-NEXT: addi t3, a1, 384
; RV32-NEXT: vslidedown.vi v2, v5, 2
; RV32-NEXT: li a3, 32
-; RV32-NEXT: addi t4, a6, -32
-; RV32-NEXT: sltu a6, a6, t4
-; RV32-NEXT: addi a6, a6, -1
-; RV32-NEXT: and a6, a6, t4
-; RV32-NEXT: addi t4, a6, -16
-; RV32-NEXT: sltu s0, a6, t4
-; RV32-NEXT: addi s0, s0, -1
+; RV32-NEXT: sltiu t6, a6, 33
+; RV32-NEXT: addi a6, a6, -32
+; RV32-NEXT: neg t6, t6
+; RV32-NEXT: and a6, t6, a6
+; RV32-NEXT: sltiu t6, a6, 17
+; RV32-NEXT: neg t6, t6
+; RV32-NEXT: addi s0, a6, -16
; RV32-NEXT: bltu a6, a2, .LBB16_6
; RV32-NEXT: # %bb.5:
; RV32-NEXT: li a6, 16
; RV32-NEXT: .LBB16_6:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vle64.v v8, (t6)
-; RV32-NEXT: csrr t6, vlenb
+; RV32-NEXT: vle64.v v8, (t5)
+; RV32-NEXT: csrr t5, vlenb
; RV32-NEXT: sw a0, 4(sp) # 4-byte Folded Spill
; RV32-NEXT: li a0, 56
-; RV32-NEXT: mul t6, t6, a0
+; RV32-NEXT: mul t5, t5, a0
; RV32-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT: add t6, sp, t6
-; RV32-NEXT: addi t6, t6, 16
-; RV32-NEXT: vs8r.v v8, (t6) # vscale x 64-byte Folded Spill
-; RV32-NEXT: vle64.v v8, (t5)
-; RV32-NEXT: vle64.v v16, (t2)
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 16
+; RV32-NEXT: vs8r.v v8, (t5) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vle64.v v16, (t4)
+; RV32-NEXT: vle64.v v8, (t2)
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: li t4, 40
+; RV32-NEXT: mul t2, t2, t4
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 16
+; RV32-NEXT: vs8r.v v8, (t2) # vscale x 64-byte Folded Spill
; RV32-NEXT: vle64.v v24, (a1)
; RV32-NEXT: csrr t2, vlenb
-; RV32-NEXT: li t5, 48
-; RV32-NEXT: mul t2, t2, t5
+; RV32-NEXT: li t4, 48
+; RV32-NEXT: mul t2, t2, t4
; RV32-NEXT: add t2, sp, t2
; RV32-NEXT: addi t2, t2, 16
; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
@@ -296,8 +302,8 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: add t2, sp, t2
; RV32-NEXT: addi t2, t2, 16
; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: and t2, t1, t0
-; RV32-NEXT: and t1, s0, t4
+; RV32-NEXT: and t2, t0, t1
+; RV32-NEXT: and t1, t6, s0
; RV32-NEXT: addi a1, a1, 256
; RV32-NEXT: mv t0, a4
; RV32-NEXT: bltu a4, a3, .LBB16_8
@@ -305,45 +311,45 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: li t0, 32
; RV32-NEXT: .LBB16_8:
; RV32-NEXT: vsetvli zero, t2, e32, m4, ta, ma
-; RV32-NEXT: vnsrl.wi v24, v8, 0, v0.t
-; RV32-NEXT: csrr t2, vlenb
-; RV32-NEXT: li t3, 24
-; RV32-NEXT: mul t2, t2, t3
-; RV32-NEXT: add t2, sp, t2
-; RV32-NEXT: addi t2, t2, 16
-; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vnsrl.wi v24, v16, 0, v0.t
; RV32-NEXT: vmv1r.v v0, v3
; RV32-NEXT: csrr t2, vlenb
; RV32-NEXT: li t3, 56
; RV32-NEXT: mul t2, t2, t3
; RV32-NEXT: add t2, sp, t2
; RV32-NEXT: addi t2, t2, 16
-; RV32-NEXT: vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v8, (t2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a5, e32, m4, ta, ma
-; RV32-NEXT: vnsrl.wi v8, v24, 0, v0.t
+; RV32-NEXT: vnsrl.wi v16, v8, 0, v0.t
; RV32-NEXT: csrr a5, vlenb
; RV32-NEXT: slli a5, a5, 6
; RV32-NEXT: add a5, sp, a5
; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
; RV32-NEXT: vmv1r.v v0, v6
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: li t2, 40
+; RV32-NEXT: mul a5, a5, t2
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, t1, e32, m4, ta, ma
-; RV32-NEXT: vnsrl.wi v8, v16, 0, v0.t
+; RV32-NEXT: vnsrl.wi v16, v8, 0, v0.t
; RV32-NEXT: csrr a5, vlenb
; RV32-NEXT: slli a5, a5, 4
; RV32-NEXT: add a5, sp, a5
; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a5, t0, -16
-; RV32-NEXT: sltu t0, t0, a5
-; RV32-NEXT: addi t0, t0, -1
-; RV32-NEXT: and a5, t0, a5
+; RV32-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV32-NEXT: sltiu a5, t0, 17
+; RV32-NEXT: addi t0, t0, -16
+; RV32-NEXT: neg a5, a5
+; RV32-NEXT: and a5, a5, t0
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a1)
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v30, v7, 2
+; RV32-NEXT: vslidedown.vi v28, v7, 2
; RV32-NEXT: vmv1r.v v0, v4
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li t0, 48
@@ -364,9 +370,15 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a5, e32, m4, ta, ma
-; RV32-NEXT: vnsrl.wi v16, v8, 0, v0.t
+; RV32-NEXT: vnsrl.wi v8, v16, 0, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a5, 40
+; RV32-NEXT: mul a1, a1, a5
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
; RV32-NEXT: bltu a4, a2, .LBB16_10
; RV32-NEXT: # %bb.9:
; RV32-NEXT: li a4, 16
@@ -375,32 +387,33 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a4, e32, m4, ta, ma
-; RV32-NEXT: vnsrl.wi v24, v8, 0, v0.t
+; RV32-NEXT: vnsrl.wi v16, v8, 0, v0.t
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a4, 48
; RV32-NEXT: mul a1, a1, a4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
; RV32-NEXT: mv a1, a7
; RV32-NEXT: bltu a7, a3, .LBB16_12
; RV32-NEXT: # %bb.11:
; RV32-NEXT: li a1, 32
; RV32-NEXT: .LBB16_12:
-; RV32-NEXT: vmv1r.v v0, v30
+; RV32-NEXT: vmv1r.v v0, v28
+; RV32-NEXT: vmv4r.v v8, v24
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: li a5, 40
+; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
+; RV32-NEXT: li a5, 40
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
@@ -417,7 +430,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a4, a1, -16
+; RV32-NEXT: sltiu a4, a1, 17
; RV32-NEXT: csrr a5, vlenb
; RV32-NEXT: li a6, 56
; RV32-NEXT: mul a5, a5, a6
@@ -438,7 +451,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: addi a5, a5, 16
; RV32-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
+; RV32-NEXT: li a6, 40
; RV32-NEXT: mul a5, a5, a6
; RV32-NEXT: add a5, sp, a5
; RV32-NEXT: addi a5, a5, 16
@@ -450,11 +463,12 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: add a5, sp, a5
; RV32-NEXT: addi a5, a5, 16
; RV32-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV32-NEXT: sltu a1, a1, a4
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a1, a1, a4
+; RV32-NEXT: neg a4, a4
+; RV32-NEXT: addi a1, a1, -16
+; RV32-NEXT: and a1, a4, a1
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
@@ -466,35 +480,34 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: .LBB16_14:
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 40
-; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a7, e32, m4, ta, ma
-; RV32-NEXT: vnsrl.wi v24, v16, 0, v0.t
+; RV32-NEXT: vnsrl.wi v16, v24, 0, v0.t
; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vslideup.vi v24, v8, 16
-; RV32-NEXT: vse32.v v24, (a0)
-; RV32-NEXT: addi a1, a0, 256
+; RV32-NEXT: vslideup.vi v16, v8, 16
+; RV32-NEXT: vse32.v v16, (a0)
+; RV32-NEXT: addi a1, a0, 128
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 48
+; RV32-NEXT: li a3, 56
; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vse32.v v8, (a1)
-; RV32-NEXT: addi a1, a0, 128
+; RV32-NEXT: addi a1, a0, 384
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 56
-; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: slli a2, a2, 6
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vse32.v v8, (a1)
-; RV32-NEXT: addi a0, a0, 384
+; RV32-NEXT: addi a0, a0, 256
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 6
+; RV32-NEXT: li a2, 48
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
@@ -537,66 +550,66 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV64-NEXT: vslidedown.vi v5, v0, 8
; RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64-NEXT: vslidedown.vi v4, v0, 4
-; RV64-NEXT: addi a2, a7, -64
-; RV64-NEXT: vslidedown.vi v3, v5, 4
-; RV64-NEXT: sltu a3, a7, a2
-; RV64-NEXT: addi a3, a3, -1
-; RV64-NEXT: and a4, a3, a2
-; RV64-NEXT: addi a2, a4, -32
-; RV64-NEXT: sltu a3, a4, a2
-; RV64-NEXT: addi a3, a3, -1
-; RV64-NEXT: and a3, a3, a2
+; RV64-NEXT: sltiu a2, a7, 65
+; RV64-NEXT: addi a3, a7, -64
+; RV64-NEXT: neg a4, a2
+; RV64-NEXT: and a4, a4, a3
+; RV64-NEXT: sltiu a2, a4, 33
+; RV64-NEXT: addi a3, a4, -32
+; RV64-NEXT: neg a2, a2
+; RV64-NEXT: and t1, a2, a3
; RV64-NEXT: li a2, 16
-; RV64-NEXT: addi t0, a3, -16
-; RV64-NEXT: mv a5, a3
-; RV64-NEXT: bltu a3, a2, .LBB16_2
+; RV64-NEXT: vslidedown.vi v3, v5, 4
+; RV64-NEXT: mv a5, t1
+; RV64-NEXT: bltu t1, a2, .LBB16_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a5, 16
; RV64-NEXT: .LBB16_2:
-; RV64-NEXT: li t2, 64
-; RV64-NEXT: sltu t1, a3, t0
+; RV64-NEXT: li t0, 64
+; RV64-NEXT: sltiu a3, t1, 17
; RV64-NEXT: mv a6, a7
-; RV64-NEXT: bltu a7, t2, .LBB16_4
+; RV64-NEXT: bltu a7, t0, .LBB16_4
; RV64-NEXT: # %bb.3:
; RV64-NEXT: li a6, 64
; RV64-NEXT: .LBB16_4:
; RV64-NEXT: addi t2, a1, 128
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v6, v4, 2
-; RV64-NEXT: addi t6, a1, 512
-; RV64-NEXT: addi t5, a1, 640
+; RV64-NEXT: addi t5, a1, 512
+; RV64-NEXT: addi t4, a1, 640
; RV64-NEXT: vslidedown.vi v0, v3, 2
-; RV64-NEXT: addi t1, t1, -1
+; RV64-NEXT: neg t0, a3
+; RV64-NEXT: addi t1, t1, -16
; RV64-NEXT: addi t3, a1, 384
; RV64-NEXT: vslidedown.vi v2, v5, 2
; RV64-NEXT: li a3, 32
-; RV64-NEXT: addi t4, a6, -32
-; RV64-NEXT: sltu a6, a6, t4
-; RV64-NEXT: addi a6, a6, -1
-; RV64-NEXT: and a6, a6, t4
-; RV64-NEXT: addi t4, a6, -16
-; RV64-NEXT: sltu s0, a6, t4
-; RV64-NEXT: addi s0, s0, -1
+; RV64-NEXT: sltiu t6, a6, 33
+; RV64-NEXT: addi a6, a6, -32
+; RV64-NEXT: neg t6, t6
+; RV64-NEXT: and a6, t6, a6
+; RV64-NEXT: sltiu t6, a6, 17
+; RV64-NEXT: neg t6, t6
+; RV64-NEXT: addi s0, a6, -16
; RV64-NEXT: bltu a6, a2, .LBB16_6
; RV64-NEXT: # %bb.5:
; RV64-NEXT: li a6, 16
; RV64-NEXT: .LBB16_6:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vle64.v v8, (t6)
-; RV64-NEXT: csrr t6, vlenb
+; RV64-NEXT: vle64.v v8, (t5)
+; RV64-NEXT: csrr t5, vlenb
; RV64-NEXT: sd a0, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: li a0, 56
-; RV64-NEXT: mul t6, t6, a0
+; RV64-NEXT: mul t5, t5, a0
; RV64-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: add t6, sp, t6
-; RV64-NEXT: addi t6, t6, 32
-; RV64-NEXT: vs8r.v v8, (t6) # vscale x 64-byte Folded Spill
-; RV64-NEXT: vle64.v v8, (t5)
+; RV64-NEXT: add t5, sp, t5
+; RV64-NEXT: addi t5, t5, 32
+; RV64-NEXT: vs8r.v v8, (t5) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vle64.v v8, (t4)
; RV64-NEXT: vle64.v v16, (t2)
; RV64-NEXT: vle64.v v24, (a1)
; RV64-NEXT: csrr t2, vlenb
-; RV64-NEXT: li t5, 48
-; RV64-NEXT: mul t2, t2, t5
+; RV64-NEXT: li t4, 48
+; RV64-NEXT: mul t2, t2, t4
; RV64-NEXT: add t2, sp, t2
; RV64-NEXT: addi t2, t2, 32
; RV64-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
@@ -606,8 +619,8 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV64-NEXT: add t2, sp, t2
; RV64-NEXT: addi t2, t2, 32
; RV64-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: and t2, t1, t0
-; RV64-NEXT: and t1, s0, t4
+; RV64-NEXT: and t2, t0, t1
+; RV64-NEXT: and t1, t6, s0
; RV64-NEXT: addi a1, a1, 256
; RV64-NEXT: mv t0, a4
; RV64-NEXT: bltu a4, a3, .LBB16_8
@@ -644,10 +657,10 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV64-NEXT: add a5, sp, a5
; RV64-NEXT: addi a5, a5, 32
; RV64-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT: addi a5, t0, -16
-; RV64-NEXT: sltu t0, t0, a5
-; RV64-NEXT: addi t0, t0, -1
-; RV64-NEXT: and a5, t0, a5
+; RV64-NEXT: sltiu a5, t0, 17
+; RV64-NEXT: addi t0, t0, -16
+; RV64-NEXT: neg a5, a5
+; RV64-NEXT: and a5, a5, t0
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a1)
; RV64-NEXT: addi a1, sp, 32
@@ -727,7 +740,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV64-NEXT: add a4, sp, a4
; RV64-NEXT: addi a4, a4, 32
; RV64-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT: addi a4, a1, -16
+; RV64-NEXT: sltiu a4, a1, 17
; RV64-NEXT: csrr a5, vlenb
; RV64-NEXT: li a6, 56
; RV64-NEXT: mul a5, a5, a6
@@ -760,9 +773,9 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV64-NEXT: add a5, sp, a5
; RV64-NEXT: addi a5, a5, 32
; RV64-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT: sltu a1, a1, a4
-; RV64-NEXT: addi a1, a1, -1
-; RV64-NEXT: and a1, a1, a4
+; RV64-NEXT: neg a4, a4
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: and a1, a4, a1
; RV64-NEXT: csrr a4, vlenb
; RV64-NEXT: slli a4, a4, 5
; RV64-NEXT: add a4, sp, a4
@@ -786,17 +799,17 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; RV64-NEXT: vslideup.vi v24, v8, 16
; RV64-NEXT: vse32.v v24, (a0)
-; RV64-NEXT: addi a1, a0, 256
+; RV64-NEXT: addi a1, a0, 128
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 48
+; RV64-NEXT: li a3, 56
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 32
; RV64-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; RV64-NEXT: vse32.v v8, (a1)
-; RV64-NEXT: addi a1, a0, 128
+; RV64-NEXT: addi a1, a0, 256
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 56
+; RV64-NEXT: li a3, 48
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 32
@@ -837,10 +850,10 @@ define <32 x i32> @vtrunc_v32i32_v32i64(<32 x i64> %a, <32 x i1> %m, i32 zeroext
; CHECK-NEXT: .LBB17_2:
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v24, 0, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v12
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT: vnsrl.wi v24, v16, 0, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
index 3d1febe95421f..cde3f21947824 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
@@ -372,10 +372,10 @@ define <32 x double> @vuitofp_v32f64_v32i64(<32 x i64> %va, <32 x i1> %m, i32 ze
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.xu.v v16, v16, v0.t
@@ -395,10 +395,10 @@ define <32 x double> @vuitofp_v32f64_v32i64_unmasked(<32 x i64> %va, i32 zeroext
; CHECK-NEXT: .LBB26_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.xu.v v8, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.xu.v v16, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
index 96dff2464e501..3fc3b47113a32 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
@@ -354,10 +354,10 @@ define <256 x i8> @vadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %ev
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
-; CHECK-NEXT: addi a0, a1, -128
-; CHECK-NEXT: sltu a3, a1, a0
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a0, a3, a0
+; CHECK-NEXT: sltiu a0, a1, 129
+; CHECK-NEXT: addi a3, a1, -128
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: and a0, a0, a3
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t
; CHECK-NEXT: bltu a1, a2, .LBB32_2
@@ -383,10 +383,10 @@ define <256 x i8> @vadd_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
; CHECK-NEXT: .LBB33_2:
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vadd.vi v8, v8, -1
-; CHECK-NEXT: addi a1, a0, -128
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 129
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -128
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vadd.vi v16, v16, -1
; CHECK-NEXT: ret
@@ -1328,10 +1328,10 @@ define <32 x i64> @vadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; CHECK-NEXT: .LBB108_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t
@@ -1351,10 +1351,10 @@ define <32 x i64> @vadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; CHECK-NEXT: .LBB109_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vadd.vi v8, v8, -1
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vadd.vi v16, v16, -1
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
index da26c63b61e34..f2e051ee41ccb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
@@ -453,10 +453,10 @@ define <32 x double> @vfsgnj_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: .LBB34_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfsgnj.vv v8, v8, v24, v0.t
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -488,10 +488,10 @@ define <32 x double> @vfsgnj_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: .LBB35_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfsgnj.vv v8, v8, v0
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfsgnj.vv v16, v16, v24
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
index 2774aba974a29..12c7009e43a44 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
@@ -621,10 +621,10 @@ define <32 x double> @vfabs_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; CHECK-NEXT: .LBB34_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v8, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v16, v0.t
@@ -644,10 +644,10 @@ define <32 x double> @vfabs_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; CHECK-NEXT: .LBB35_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v8, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
index f28b970f48ff7..e863e141376e9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
@@ -855,10 +855,10 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a0, a4, -16
-; CHECK-NEXT: sltu a1, a4, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a4, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a4, a4, -16
+; CHECK-NEXT: and a0, a0, a4
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: li a2, 24
@@ -898,27 +898,21 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
; CHECK-NEXT: addi a1, a2, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a2)
; CHECK-NEXT: addi a2, a0, 128
-; CHECK-NEXT: vle64.v v8, (a1)
+; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
; CHECK-NEXT: vle64.v v24, (a2)
; CHECK-NEXT: vle64.v v0, (a0)
; CHECK-NEXT: li a1, 16
@@ -927,31 +921,25 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a0, 16
; CHECK-NEXT: .LBB51_2:
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmadd.vv v0, v8, v16
-; CHECK-NEXT: addi a0, a4, -16
-; CHECK-NEXT: sltu a1, a4, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a4, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a4, a4, -16
+; CHECK-NEXT: and a0, a0, a4
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v24, v16, v8
+; CHECK-NEXT: vfmadd.vv v24, v8, v16
; CHECK-NEXT: vmv8r.v v8, v0
; CHECK-NEXT: vmv.v.v v16, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
index 403d0b8d57940..484389e29bed9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
@@ -381,10 +381,10 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: .LBB26_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmax.vv v8, v8, v24, v0.t
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -416,10 +416,10 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: .LBB27_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmax.vv v8, v8, v0
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmax.vv v16, v16, v24
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
index 56f7a8d48c5a1..92564e229bccc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
@@ -381,10 +381,10 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: .LBB26_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmin.vv v8, v8, v24, v0.t
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -416,10 +416,10 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: .LBB27_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmin.vv v8, v8, v0
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmin.vv v16, v16, v24
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
index a9857880b5942..5298b186f2d25 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
@@ -627,10 +627,10 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a0, a4, -16
-; CHECK-NEXT: sltu a1, a4, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a4, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a4, a4, -16
+; CHECK-NEXT: and a0, a0, a4
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: li a2, 24
@@ -670,27 +670,21 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
; CHECK-NEXT: addi a1, a2, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a2)
; CHECK-NEXT: addi a2, a0, 128
-; CHECK-NEXT: vle64.v v8, (a1)
+; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
; CHECK-NEXT: vle64.v v24, (a2)
; CHECK-NEXT: vle64.v v0, (a0)
; CHECK-NEXT: li a1, 16
@@ -699,31 +693,25 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a0, 16
; CHECK-NEXT: .LBB51_2:
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmadd.vv v0, v8, v16
-; CHECK-NEXT: addi a0, a4, -16
-; CHECK-NEXT: sltu a1, a4, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a4, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a4, a4, -16
+; CHECK-NEXT: and a0, a0, a4
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v24, v16, v8
+; CHECK-NEXT: vfmadd.vv v24, v8, v16
; CHECK-NEXT: vmv8r.v v8, v0
; CHECK-NEXT: vmv.v.v v16, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
index 84a89b23bc3b5..2b09bd9a22b1f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
@@ -589,10 +589,10 @@ define <32 x double> @vfneg_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; CHECK-NEXT: .LBB34_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfneg.v v8, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfneg.v v16, v16, v0.t
@@ -612,10 +612,10 @@ define <32 x double> @vfneg_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; CHECK-NEXT: .LBB35_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfneg.v v8, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfneg.v v16, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
index b431d4873fa1b..9f72f786591a5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
@@ -361,10 +361,10 @@ define <32 x double> @vfsqrt_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zero
; CHECK-NEXT: .LBB26_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfsqrt.v v8, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfsqrt.v v16, v16, v0.t
@@ -384,10 +384,10 @@ define <32 x double> @vfsqrt_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %
; CHECK-NEXT: .LBB27_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfsqrt.v v8, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfsqrt.v v16, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
index f5978de080082..aa7c3d5e113d6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
@@ -258,10 +258,10 @@ define <256 x i8> @vmax_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zero
; CHECK-NEXT: li a3, 128
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a1)
-; CHECK-NEXT: addi a1, a2, -128
-; CHECK-NEXT: sltu a4, a2, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: sltiu a1, a2, 129
+; CHECK-NEXT: addi a4, a2, -128
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a1, a1, a4
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmax.vx v16, v16, a0, v0.t
; CHECK-NEXT: bltu a2, a3, .LBB22_2
@@ -289,10 +289,10 @@ define <256 x i8> @vmax_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %e
; CHECK-NEXT: .LBB23_2:
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vmax.vx v8, v8, a0
-; CHECK-NEXT: addi a2, a1, -128
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: sltiu a2, a1, 129
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: addi a1, a1, -128
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmax.vx v16, v16, a0
; CHECK-NEXT: ret
@@ -1001,10 +1001,10 @@ define <32 x i64> @vmax_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vmax.vx v8, v8, a2, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmax.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
index 7450a70df66ba..3d6dc76d5e70d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
@@ -257,10 +257,10 @@ define <256 x i8> @vmaxu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zer
; CHECK-NEXT: li a3, 128
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a1)
-; CHECK-NEXT: addi a1, a2, -128
-; CHECK-NEXT: sltu a4, a2, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: sltiu a1, a2, 129
+; CHECK-NEXT: addi a4, a2, -128
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a1, a1, a4
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmaxu.vx v16, v16, a0, v0.t
; CHECK-NEXT: bltu a2, a3, .LBB22_2
@@ -288,10 +288,10 @@ define <256 x i8> @vmaxu_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %
; CHECK-NEXT: .LBB23_2:
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vmaxu.vx v8, v8, a0
-; CHECK-NEXT: addi a2, a1, -128
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: sltiu a2, a1, 129
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: addi a1, a1, -128
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmaxu.vx v16, v16, a0
; CHECK-NEXT: ret
@@ -1000,10 +1000,10 @@ define <32 x i64> @vmaxu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vmaxu.vx v8, v8, a2, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmaxu.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
index 31d19304c2909..5000bea58fa36 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
@@ -258,10 +258,10 @@ define <256 x i8> @vmin_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zero
; CHECK-NEXT: li a3, 128
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a1)
-; CHECK-NEXT: addi a1, a2, -128
-; CHECK-NEXT: sltu a4, a2, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: sltiu a1, a2, 129
+; CHECK-NEXT: addi a4, a2, -128
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a1, a1, a4
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmin.vx v16, v16, a0, v0.t
; CHECK-NEXT: bltu a2, a3, .LBB22_2
@@ -289,10 +289,10 @@ define <256 x i8> @vmin_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %e
; CHECK-NEXT: .LBB23_2:
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vmin.vx v8, v8, a0
-; CHECK-NEXT: addi a2, a1, -128
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: sltiu a2, a1, 129
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: addi a1, a1, -128
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmin.vx v16, v16, a0
; CHECK-NEXT: ret
@@ -1001,10 +1001,10 @@ define <32 x i64> @vmin_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vmin.vx v8, v8, a2, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmin.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
index dda69ec8a7d2e..42b05a295e50e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
@@ -257,10 +257,10 @@ define <256 x i8> @vminu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zer
; CHECK-NEXT: li a3, 128
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a1)
-; CHECK-NEXT: addi a1, a2, -128
-; CHECK-NEXT: sltu a4, a2, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: sltiu a1, a2, 129
+; CHECK-NEXT: addi a4, a2, -128
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a1, a1, a4
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vminu.vx v16, v16, a0, v0.t
; CHECK-NEXT: bltu a2, a3, .LBB22_2
@@ -288,10 +288,10 @@ define <256 x i8> @vminu_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %
; CHECK-NEXT: .LBB23_2:
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vminu.vx v8, v8, a0
-; CHECK-NEXT: addi a2, a1, -128
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: sltiu a2, a1, 129
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: addi a1, a1, -128
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vminu.vx v16, v16, a0
; CHECK-NEXT: ret
@@ -1000,10 +1000,10 @@ define <32 x i64> @vminu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vminu.vx v8, v8, a2, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vminu.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
index 3f5751aaa2cad..071a726604787 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
@@ -285,16 +285,16 @@ define <32 x i8> @vpgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %
; RV64-NEXT: vsext.vf8 v16, v8
; RV64-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; RV64-NEXT: vluxei64.v v10, (a0), v16, v0.t
-; RV64-NEXT: addi a2, a1, -16
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, ma
; RV64-NEXT: vslidedown.vi v8, v8, 16
-; RV64-NEXT: sltu a1, a1, a2
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vsext.vf8 v16, v8
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: addi a1, a1, -1
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: li a0, 32
@@ -1997,12 +1997,12 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex
; RV32-NEXT: .LBB94_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v24, (zero), v8, v0.t
-; RV32-NEXT: addi a1, a0, -16
+; RV32-NEXT: sltiu a1, a0, 17
+; RV32-NEXT: addi a0, a0, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a0, a0, a1
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: neg a1, a1
+; RV32-NEXT: and a0, a1, a0
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v8, v8, 16
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -2020,12 +2020,12 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex
; RV64-NEXT: .LBB94_2:
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (zero), v8, v0.t
-; RV64-NEXT: addi a1, a0, -16
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: sltiu a1, a0, 17
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: neg a1, a1
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: and a0, a1, a0
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (zero), v16, v0.t
; RV64-NEXT: ret
@@ -2048,12 +2048,12 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
; RV32-NEXT: .LBB95_2:
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT: addi a2, a1, -16
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -2077,12 +2077,12 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
; RV64-NEXT: .LBB95_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2106,12 +2106,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
; RV32-NEXT: .LBB96_2:
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT: addi a2, a1, -16
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -2136,12 +2136,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
; RV64-NEXT: .LBB96_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2168,12 +2168,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
; RV32-NEXT: vluxei16.v v8, (a0), v16, v0.t
; RV32-NEXT: vsetivli zero, 16, e16, m4, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
-; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
+; RV32-NEXT: neg a2, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei16.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
@@ -2194,12 +2194,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
; RV64-NEXT: vluxei16.v v8, (a0), v16, v0.t
; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma
; RV64-NEXT: vslidedown.vi v24, v16, 16
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei16.v v16, (a0), v24, v0.t
; RV64-NEXT: ret
@@ -2226,12 +2226,12 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
-; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
+; RV32-NEXT: neg a2, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
@@ -2253,12 +2253,12 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
; RV64-NEXT: .LBB98_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2284,12 +2284,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
-; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
+; RV32-NEXT: neg a2, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
@@ -2312,12 +2312,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
; RV64-NEXT: .LBB99_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2344,12 +2344,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
-; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
+; RV32-NEXT: neg a2, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
@@ -2370,12 +2370,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
; RV64-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV64-NEXT: vslidedown.vi v24, v16, 16
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV64-NEXT: ret
@@ -2399,12 +2399,12 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
; RV32-NEXT: .LBB101_2:
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT: addi a2, a1, -16
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -2427,12 +2427,12 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
; RV64-NEXT: .LBB101_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2455,12 +2455,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
; RV32-NEXT: .LBB102_2:
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT: addi a2, a1, -16
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -2483,12 +2483,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
; RV64-NEXT: .LBB102_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2512,12 +2512,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
; RV32-NEXT: .LBB103_2:
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT: addi a2, a1, -16
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -2540,12 +2540,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
; RV64-NEXT: .LBB103_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2575,12 +2575,12 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
-; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
+; RV32-NEXT: neg a2, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
@@ -2598,12 +2598,12 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
; RV64-NEXT: .LBB104_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
index d058669c103f3..8e50dffcaf31c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
@@ -325,12 +325,12 @@ define <32 x double> @vpload_v32f64(ptr %ptr, <32 x i1> %m, i32 zeroext %evl) {
; CHECK-NEXT: .LBB31_2:
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a0), v0.t
-; CHECK-NEXT: addi a2, a1, -16
+; CHECK-NEXT: sltiu a2, a1, 17
+; CHECK-NEXT: addi a1, a1, -16
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 2
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a0), v0.t
@@ -352,15 +352,15 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) {
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: .LBB32_2:
-; CHECK-NEXT: addi a5, a3, -16
+; CHECK-NEXT: sltiu a5, a3, 17
+; CHECK-NEXT: addi a3, a3, -16
; CHECK-NEXT: addi a4, a1, 128
-; CHECK-NEXT: addi a7, a2, -32
-; CHECK-NEXT: sltu a3, a3, a5
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a6, a3, a5
-; CHECK-NEXT: sltu a3, a2, a7
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a5, a3, a7
+; CHECK-NEXT: sltiu a7, a2, 33
+; CHECK-NEXT: neg a5, a5
+; CHECK-NEXT: and a6, a5, a3
+; CHECK-NEXT: addi a3, a2, -32
+; CHECK-NEXT: neg a5, a7
+; CHECK-NEXT: and a5, a5, a3
; CHECK-NEXT: li a3, 16
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v8, 2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
index 0bacb5c26cb4a..3a36cda6dd04a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
@@ -1306,12 +1306,12 @@ define <32 x double> @vpmerge_vv_v32f64(<32 x double> %va, <32 x double> %vb, <3
; CHECK-NEXT: .LBB83_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: neg a0, a0
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 2
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma
@@ -1339,12 +1339,12 @@ define <32 x double> @vpmerge_vf_v32f64(double %a, <32 x double> %vb, <32 x i1>
; CHECK-NEXT: .LBB84_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, ma
; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 2
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma
; CHECK-NEXT: vfmerge.vfm v16, v16, fa0, v0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
index b4d20d93f2a1c..e509b390a3067 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
@@ -1703,12 +1703,12 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
; RV32-NEXT: .LBB83_2:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (zero), v24, v0.t
-; RV32-NEXT: addi a0, a1, -16
+; RV32-NEXT: sltiu a0, a1, 17
+; RV32-NEXT: addi a1, a1, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a1, a1, a0
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a0, a1, a0
+; RV32-NEXT: neg a0, a0
+; RV32-NEXT: and a0, a0, a1
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v8, v24, 16
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1737,12 +1737,12 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
; RV64-NEXT: .LBB83_2:
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t
-; RV64-NEXT: addi a0, a2, -16
-; RV64-NEXT: sltu a1, a2, a0
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a0, a2, 17
+; RV64-NEXT: addi a2, a2, -16
+; RV64-NEXT: neg a0, a0
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a0, a1, a0
+; RV64-NEXT: and a0, a0, a2
; RV64-NEXT: addi a1, sp, 16
; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1773,12 +1773,12 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
; RV32-NEXT: .LBB84_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
-; RV32-NEXT: addi a1, a2, -16
+; RV32-NEXT: sltiu a1, a2, 17
+; RV32-NEXT: addi a2, a2, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a2, a2, a1
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: neg a1, a1
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v8, v24, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -1819,12 +1819,12 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
; RV64-NEXT: vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t
-; RV64-NEXT: addi a1, a2, -16
-; RV64-NEXT: sltu a2, a2, a1
-; RV64-NEXT: addi a2, a2, -1
+; RV64-NEXT: sltiu a1, a2, 17
+; RV64-NEXT: addi a2, a2, -16
+; RV64-NEXT: neg a1, a1
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a2, a1
+; RV64-NEXT: and a1, a1, a2
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: add a2, sp, a2
@@ -1859,12 +1859,12 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
; RV32-NEXT: .LBB85_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
-; RV32-NEXT: addi a1, a2, -16
+; RV32-NEXT: sltiu a1, a2, 17
+; RV32-NEXT: addi a2, a2, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a2, a2, a1
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: neg a1, a1
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v8, v24, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -1905,12 +1905,12 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
; RV64-NEXT: vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t
-; RV64-NEXT: addi a1, a2, -16
-; RV64-NEXT: sltu a2, a2, a1
-; RV64-NEXT: addi a2, a2, -1
+; RV64-NEXT: sltiu a1, a2, 17
+; RV64-NEXT: addi a2, a2, -16
+; RV64-NEXT: neg a1, a1
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a2, a1
+; RV64-NEXT: and a1, a1, a2
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: add a2, sp, a2
@@ -1946,12 +1946,12 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
; RV32-NEXT: .LBB86_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
-; RV32-NEXT: addi a1, a2, -16
+; RV32-NEXT: sltiu a1, a2, 17
+; RV32-NEXT: addi a2, a2, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a2, a2, a1
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: neg a1, a1
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v8, v24, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -1992,12 +1992,12 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
; RV64-NEXT: vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t
-; RV64-NEXT: addi a1, a2, -16
-; RV64-NEXT: sltu a2, a2, a1
-; RV64-NEXT: addi a2, a2, -1
+; RV64-NEXT: sltiu a1, a2, 17
+; RV64-NEXT: addi a2, a2, -16
+; RV64-NEXT: neg a1, a1
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a2, a1
+; RV64-NEXT: and a1, a1, a2
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: add a2, sp, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
index 855a87d21b7dc..b4e402caf5ba4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
@@ -255,12 +255,12 @@ define void @vpstore_v32f64(<32 x double> %val, ptr %ptr, <32 x i1> %m, i32 zero
; CHECK-NEXT: .LBB24_2:
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a0), v0.t
-; CHECK-NEXT: addi a2, a1, -16
+; CHECK-NEXT: sltiu a2, a1, 17
+; CHECK-NEXT: addi a1, a1, -16
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 2
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v16, (a0), v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
index acaa1e6fa002d..495049e51fb64 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
@@ -363,10 +363,10 @@ define <256 x i8> @vsadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
-; CHECK-NEXT: addi a0, a1, -128
-; CHECK-NEXT: sltu a3, a1, a0
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a0, a3, a0
+; CHECK-NEXT: sltiu a0, a1, 129
+; CHECK-NEXT: addi a3, a1, -128
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: and a0, a0, a3
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t
; CHECK-NEXT: bltu a1, a2, .LBB32_2
@@ -392,10 +392,10 @@ define <256 x i8> @vsadd_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
; CHECK-NEXT: .LBB33_2:
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vsadd.vi v8, v8, -1
-; CHECK-NEXT: addi a1, a0, -128
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 129
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -128
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vsadd.vi v16, v16, -1
; CHECK-NEXT: ret
@@ -1335,10 +1335,10 @@ define <32 x i64> @vsadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; CHECK-NEXT: .LBB108_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t
@@ -1358,10 +1358,10 @@ define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; CHECK-NEXT: .LBB109_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vsadd.vi v8, v8, -1
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vsadd.vi v16, v16, -1
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
index 9b3b8348d9b30..a5f57c24aaaaf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
@@ -359,10 +359,10 @@ define <256 x i8> @vsaddu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
-; CHECK-NEXT: addi a0, a1, -128
-; CHECK-NEXT: sltu a3, a1, a0
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a0, a3, a0
+; CHECK-NEXT: sltiu a0, a1, 129
+; CHECK-NEXT: addi a3, a1, -128
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: and a0, a0, a3
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t
; CHECK-NEXT: bltu a1, a2, .LBB32_2
@@ -388,10 +388,10 @@ define <256 x i8> @vsaddu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
; CHECK-NEXT: .LBB33_2:
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vsaddu.vi v8, v8, -1
-; CHECK-NEXT: addi a1, a0, -128
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 129
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -128
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vsaddu.vi v16, v16, -1
; CHECK-NEXT: ret
@@ -1331,10 +1331,10 @@ define <32 x i64> @vsaddu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %e
; CHECK-NEXT: .LBB108_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t
@@ -1354,10 +1354,10 @@ define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; CHECK-NEXT: .LBB109_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vsaddu.vi v8, v8, -1
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vsaddu.vi v16, v16, -1
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
index f2f9f90f386c0..e91477a622b1f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
@@ -143,15 +143,15 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3
; CHECK-NEXT: vmv1r.v v6, v8
; CHECK-NEXT: vmv1r.v v7, v0
; CHECK-NEXT: li a2, 128
-; CHECK-NEXT: addi a4, a1, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vle8.v v24, (a0)
-; CHECK-NEXT: addi a0, a3, -128
-; CHECK-NEXT: vle8.v v8, (a4)
-; CHECK-NEXT: sltu a4, a3, a0
+; CHECK-NEXT: addi a0, a1, 128
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: sltiu a0, a3, 129
+; CHECK-NEXT: neg a0, a0
; CHECK-NEXT: vle8.v v16, (a1)
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a0, a4, a0
+; CHECK-NEXT: addi a1, a3, -128
+; CHECK-NEXT: and a0, a0, a1
; CHECK-NEXT: vmv1r.v v0, v6
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0
@@ -342,12 +342,12 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: neg a0, a0
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 2
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -511,12 +511,12 @@ define <64 x float> @select_v64f32(<64 x i1> %a, <64 x float> %b, <64 x float> %
; CHECK-NEXT: .LBB35_2:
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: addi a0, a2, -32
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: sltiu a0, a2, 33
+; CHECK-NEXT: addi a2, a2, -32
+; CHECK-NEXT: neg a0, a0
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 4
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
index 4c7d312e8e785..0947e39ce87e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
@@ -373,12 +373,12 @@ define <256 x i8> @vssub_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: li a2, 128
-; CHECK-NEXT: addi a3, a1, -128
+; CHECK-NEXT: sltiu a3, a1, 129
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
-; CHECK-NEXT: sltu a0, a1, a3
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a3, a0, a3
+; CHECK-NEXT: addi a0, a1, -128
+; CHECK-NEXT: neg a3, a3
+; CHECK-NEXT: and a3, a3, a0
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t
@@ -406,10 +406,10 @@ define <256 x i8> @vssub_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vssub.vx v8, v8, a2
-; CHECK-NEXT: addi a1, a0, -128
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 129
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -128
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vssub.vx v16, v16, a2
; CHECK-NEXT: ret
@@ -1376,10 +1376,10 @@ define <32 x i64> @vssub_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vssub.vx v8, v8, a2, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vssub.vx v16, v16, a2, v0.t
@@ -1400,10 +1400,10 @@ define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vssub.vx v8, v8, a2
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vssub.vx v16, v16, a2
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
index f9000a1b88a6d..12fef2f06bfcf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
@@ -368,12 +368,12 @@ define <256 x i8> @vssubu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: li a2, 128
-; CHECK-NEXT: addi a3, a1, -128
+; CHECK-NEXT: sltiu a3, a1, 129
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
-; CHECK-NEXT: sltu a0, a1, a3
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a3, a0, a3
+; CHECK-NEXT: addi a0, a1, -128
+; CHECK-NEXT: neg a3, a3
+; CHECK-NEXT: and a3, a3, a0
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t
@@ -401,10 +401,10 @@ define <256 x i8> @vssubu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vssubu.vx v8, v8, a2
-; CHECK-NEXT: addi a1, a0, -128
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 129
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -128
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vssubu.vx v16, v16, a2
; CHECK-NEXT: ret
@@ -1371,10 +1371,10 @@ define <32 x i64> @vssubu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %e
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vssubu.vx v8, v8, a2, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vssubu.vx v16, v16, a2, v0.t
@@ -1395,10 +1395,10 @@ define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vssubu.vx v8, v8, a2
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vssubu.vx v16, v16, a2
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
index e2d9e0ac2deea..0bdbf1bb54074 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
@@ -147,10 +147,10 @@ define <32 x i64> @vzext_v32i64_v32i32(<32 x i32> %va, <32 x i1> %m, i32 zeroext
; CHECK-NEXT: .LBB12_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vzext.vf2 v16, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 16
; CHECK-NEXT: vmv1r.v v0, v24
@@ -174,10 +174,10 @@ define <32 x i64> @vzext_v32i64_v32i32_unmasked(<32 x i32> %va, i32 zeroext %evl
; CHECK-NEXT: .LBB13_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vzext.vf2 v24, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 16
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
index e2deefa26ecb3..0ed12ddbb0f2c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v5, v6
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v6, v7
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v5, v6
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_floor_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v6, v7
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFH-NEXT: sub a2, a0, a1
; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT: sltu a3, a0, a2
+; RV32ZVFH-NEXT: sltu a3, a1, a0
; RV32ZVFH-NEXT: addi a3, a3, -1
; RV32ZVFH-NEXT: and a2, a3, a2
; RV32ZVFH-NEXT: vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFHMIN-NEXT: sub a2, a0, a1
; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT: sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT: sltu a3, a1, a0
; RV32ZVFHMIN-NEXT: addi a3, a3, -1
; RV32ZVFHMIN-NEXT: and a2, a3, a2
; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFH-NEXT: sub a3, a0, a1
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT: sltu a2, a0, a3
+; RV32ZVFH-NEXT: sltu a2, a1, a0
; RV32ZVFH-NEXT: addi a2, a2, -1
; RV32ZVFH-NEXT: and a2, a2, a3
; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFHMIN-NEXT: sub a3, a0, a1
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT: sltu a2, a1, a0
; RV32ZVFHMIN-NEXT: addi a2, a2, -1
; RV32ZVFHMIN-NEXT: and a2, a2, a3
; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index 0e0c92b150d33..33ae7ca7d7847 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -270,14 +270,14 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v5, v16, v16, v0.t
@@ -398,18 +398,18 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v6, v16, v16, v0.t
@@ -892,14 +892,14 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v5, v16, v16, v0.t
@@ -1031,18 +1031,18 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v6, v16, v16, v0.t
@@ -1418,7 +1418,7 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: sub a4, a2, a1
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vl8re64.v v8, (a3)
-; CHECK-NEXT: sltu a3, a2, a4
+; CHECK-NEXT: sltu a3, a1, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a4
; CHECK-NEXT: vmv1r.v v0, v6
@@ -1509,7 +1509,7 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: sub a4, a2, a1
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vl8re64.v v24, (a3)
-; CHECK-NEXT: sltu a3, a2, a4
+; CHECK-NEXT: sltu a3, a1, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a4
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index 86ed239e99373..173ea25335375 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -270,14 +270,14 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v5, v16, v16, v0.t
@@ -398,18 +398,18 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v6, v16, v16, v0.t
@@ -892,14 +892,14 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v5, v16, v16, v0.t
@@ -1031,18 +1031,18 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v6, v16, v16, v0.t
@@ -1418,7 +1418,7 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: sub a4, a2, a1
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vl8re64.v v8, (a3)
-; CHECK-NEXT: sltu a3, a2, a4
+; CHECK-NEXT: sltu a3, a1, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a4
; CHECK-NEXT: vmv1r.v v0, v6
@@ -1509,7 +1509,7 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: sub a4, a2, a1
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vl8re64.v v24, (a3)
-; CHECK-NEXT: sltu a3, a2, a4
+; CHECK-NEXT: sltu a3, a1, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a4
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
index 736dd1225da88..cbccc96f43cbe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
@@ -958,7 +958,7 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: sub a5, a4, a1
; CHECK-NEXT: add a6, a2, a3
; CHECK-NEXT: vl8re64.v v8, (a6)
-; CHECK-NEXT: sltu a6, a4, a5
+; CHECK-NEXT: sltu a6, a1, a4
; CHECK-NEXT: addi a6, a6, -1
; CHECK-NEXT: and a5, a6, a5
; CHECK-NEXT: srli a6, a1, 3
@@ -1059,7 +1059,7 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: add a3, a2, a5
; CHECK-NEXT: vl8re64.v v8, (a3)
-; CHECK-NEXT: sltu a3, a4, a6
+; CHECK-NEXT: sltu a3, a1, a4
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a6, a3, a6
; CHECK-NEXT: li a3, 63
diff --git a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
index c0a794afac3ae..c9478d65058f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
@@ -57,7 +57,7 @@ define <vscale x 16 x i64> @llrint_nxv16i64_nxv16f32(<vscale x 16 x float> %x, <
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a3, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
index c09df1a60d2ae..4136bab37bc9b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
@@ -119,7 +119,7 @@ define <vscale x 16 x iXLen> @lrint_nxv16f32(<vscale x 16 x float> %x, <vscale x
; RV64-i64-NEXT: srli a2, a1, 3
; RV64-i64-NEXT: sub a3, a0, a1
; RV64-i64-NEXT: vslidedown.vx v0, v0, a2
-; RV64-i64-NEXT: sltu a2, a0, a3
+; RV64-i64-NEXT: sltu a2, a1, a0
; RV64-i64-NEXT: addi a2, a2, -1
; RV64-i64-NEXT: and a2, a2, a3
; RV64-i64-NEXT: vsetvli zero, a2, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
index 67e7f7c7fbd42..236ba9096f4f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16(<vscale x 32 x bfloat> %va
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v5, v6
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16_unmasked(<vscale x 32 x bf
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v6, v7
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vs
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v5, v6
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half>
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v6, v7
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFH-NEXT: sub a2, a0, a1
; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT: sltu a3, a0, a2
+; RV32ZVFH-NEXT: sltu a3, a1, a0
; RV32ZVFH-NEXT: addi a3, a3, -1
; RV32ZVFH-NEXT: and a2, a3, a2
; RV32ZVFH-NEXT: vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFHMIN-NEXT: sub a2, a0, a1
; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT: sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT: sltu a3, a1, a0
; RV32ZVFHMIN-NEXT: addi a3, a3, -1
; RV32ZVFHMIN-NEXT: and a2, a3, a2
; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFH-NEXT: sub a3, a0, a1
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT: sltu a2, a0, a3
+; RV32ZVFH-NEXT: sltu a2, a1, a0
; RV32ZVFH-NEXT: addi a2, a2, -1
; RV32ZVFH-NEXT: and a2, a2, a3
; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFHMIN-NEXT: sub a3, a0, a1
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT: sltu a2, a1, a0
; RV32ZVFHMIN-NEXT: addi a2, a2, -1
; RV32ZVFHMIN-NEXT: and a2, a2, a3
; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll b/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll
index 1ee7e138654b9..3e9c669106a26 100644
--- a/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll
@@ -24274,7 +24274,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_P1(<vscale x 16 x
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24302,7 +24302,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_P1(<vscale x 16 x
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24335,7 +24335,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_PALL(<vscale x 16
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24363,7 +24363,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_PALL(<vscale x 16
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24396,7 +24396,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_S1(<vscale x 16 x
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24424,7 +24424,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_S1(<vscale x 16 x
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24457,7 +24457,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_ALL(<vscale x 16 x
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24485,7 +24485,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_ALL(<vscale x 16 x
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24517,7 +24517,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_DEFAULT(<vscale x
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24545,7 +24545,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_DEFAULT(<vscale x
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24586,10 +24586,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_P1(<vscale x 16 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v9, (zero), v24
@@ -24614,10 +24614,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_P1(<vscale x 16 x i8> %val, <vs
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v9, (zero), v24
@@ -24647,10 +24647,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_PALL(<vscale x 16 x i8> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v9, (zero), v24
@@ -24675,10 +24675,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_PALL(<vscale x 16 x i8> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v9, (zero), v24
@@ -24708,10 +24708,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_S1(<vscale x 16 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v9, (zero), v24
@@ -24736,10 +24736,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_S1(<vscale x 16 x i8> %val, <vs
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v9, (zero), v24
@@ -24769,10 +24769,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_ALL(<vscale x 16 x i8> %val, <v
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v9, (zero), v24
@@ -24797,10 +24797,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_ALL(<vscale x 16 x i8> %val, <v
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v9, (zero), v24
@@ -24829,10 +24829,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_DEFAULT(<vscale x 16 x i8> %val
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v9, (zero), v24
@@ -24857,10 +24857,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_DEFAULT(<vscale x 16 x i8> %val
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v9, (zero), v24
@@ -25538,7 +25538,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_P1(<vscale x 16
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25566,7 +25566,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_P1(<vscale x 16
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25599,7 +25599,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_PALL(<vscale x 1
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25627,7 +25627,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_PALL(<vscale x 1
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25660,7 +25660,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_S1(<vscale x 16
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25688,7 +25688,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_S1(<vscale x 16
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25721,7 +25721,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_ALL(<vscale x 16
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25749,7 +25749,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_ALL(<vscale x 16
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25781,7 +25781,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_DEFAULT(<vscale
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25809,7 +25809,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_DEFAULT(<vscale
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25850,10 +25850,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_P1(<vscale x 16 x i16> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v24
@@ -25878,10 +25878,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_P1(<vscale x 16 x i16> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v24
@@ -25911,10 +25911,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_PALL(<vscale x 16 x i16> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v24
@@ -25939,10 +25939,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_PALL(<vscale x 16 x i16> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v24
@@ -25972,10 +25972,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_S1(<vscale x 16 x i16> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v24
@@ -26000,10 +26000,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_S1(<vscale x 16 x i16> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v24
@@ -26033,10 +26033,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_ALL(<vscale x 16 x i16> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v24
@@ -26061,10 +26061,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_ALL(<vscale x 16 x i16> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v24
@@ -26093,10 +26093,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_DEFAULT(<vscale x 16 x i16> %v
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v24
@@ -26121,10 +26121,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_DEFAULT(<vscale x 16 x i16> %v
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v24
@@ -26802,7 +26802,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_P1(<vscale x 16
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -26829,7 +26829,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_P1(<vscale x 16
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -26861,7 +26861,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_PALL(<vscale x 1
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -26888,7 +26888,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_PALL(<vscale x 1
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -26920,7 +26920,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_S1(<vscale x 16
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -26947,7 +26947,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_S1(<vscale x 16
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -26979,7 +26979,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_ALL(<vscale x 16
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -27006,7 +27006,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_ALL(<vscale x 16
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -27037,7 +27037,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_DEFAULT(<vscale
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -27064,7 +27064,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_DEFAULT(<vscale
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -27104,10 +27104,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_P1(<vscale x 16 x i32> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -27132,10 +27132,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_P1(<vscale x 16 x i32> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -27165,10 +27165,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_PALL(<vscale x 16 x i32> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -27193,10 +27193,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_PALL(<vscale x 16 x i32> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -27226,10 +27226,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_S1(<vscale x 16 x i32> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -27254,10 +27254,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_S1(<vscale x 16 x i32> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -27287,10 +27287,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_ALL(<vscale x 16 x i32> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -27315,10 +27315,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_ALL(<vscale x 16 x i32> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -27347,10 +27347,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_DEFAULT(<vscale x 16 x i32> %v
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -27375,10 +27375,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_DEFAULT(<vscale x 16 x i32> %v
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -28056,7 +28056,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_P1(<vscale x 1
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28083,7 +28083,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_P1(<vscale x 1
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28115,7 +28115,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_PALL(<vscale x
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28142,7 +28142,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_PALL(<vscale x
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28174,7 +28174,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_S1(<vscale x 1
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28201,7 +28201,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_S1(<vscale x 1
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28233,7 +28233,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_ALL(<vscale x
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28260,7 +28260,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_ALL(<vscale x
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28291,7 +28291,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_DEFAULT(<vscal
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28318,7 +28318,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_DEFAULT(<vscal
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28358,10 +28358,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_P1(<vscale x 16 x float> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -28386,10 +28386,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_P1(<vscale x 16 x float> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -28419,10 +28419,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_PALL(<vscale x 16 x float> %va
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -28447,10 +28447,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_PALL(<vscale x 16 x float> %va
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -28480,10 +28480,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_S1(<vscale x 16 x float> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -28508,10 +28508,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_S1(<vscale x 16 x float> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -28541,10 +28541,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_ALL(<vscale x 16 x float> %val
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -28569,10 +28569,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_ALL(<vscale x 16 x float> %val
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -28601,10 +28601,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_DEFAULT(<vscale x 16 x float>
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -28629,10 +28629,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_DEFAULT(<vscale x 16 x float>
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -29322,12 +29322,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB850_2
@@ -29345,7 +29345,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB850_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29374,7 +29374,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -29406,12 +29406,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB850_2
@@ -29429,7 +29429,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB850_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29458,7 +29458,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -29495,12 +29495,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB851_2
@@ -29518,7 +29518,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB851_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29547,7 +29547,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -29579,12 +29579,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB851_2
@@ -29602,7 +29602,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB851_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29631,7 +29631,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -29668,12 +29668,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB852_2
@@ -29691,7 +29691,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB852_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29720,7 +29720,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -29752,12 +29752,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB852_2
@@ -29775,7 +29775,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB852_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29804,7 +29804,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -29841,12 +29841,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB853_2
@@ -29864,7 +29864,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB853_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29893,7 +29893,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -29925,12 +29925,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB853_2
@@ -29948,7 +29948,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB853_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29977,7 +29977,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -30013,12 +30013,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB854_2
@@ -30036,7 +30036,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB854_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30065,7 +30065,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -30097,12 +30097,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB854_2
@@ -30120,7 +30120,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB854_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30149,7 +30149,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -30226,13 +30226,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v17, (zero), v0
@@ -30244,10 +30244,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30277,10 +30277,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v24
@@ -30342,13 +30342,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v17, (zero), v0
@@ -30360,10 +30360,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30393,10 +30393,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v24
@@ -30463,13 +30463,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v17, (zero), v0
@@ -30481,10 +30481,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30514,10 +30514,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v24
@@ -30579,13 +30579,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v17, (zero), v0
@@ -30597,10 +30597,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30630,10 +30630,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v24
@@ -30700,13 +30700,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v17, (zero), v0
@@ -30718,10 +30718,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30751,10 +30751,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v24
@@ -30816,13 +30816,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v17, (zero), v0
@@ -30834,10 +30834,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30867,10 +30867,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v24
@@ -30937,13 +30937,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v17, (zero), v0
@@ -30955,10 +30955,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30988,10 +30988,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v24
@@ -31053,13 +31053,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v17, (zero), v0
@@ -31071,10 +31071,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -31104,10 +31104,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v24
@@ -31173,13 +31173,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v17, (zero), v0
@@ -31191,10 +31191,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -31224,10 +31224,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v24
@@ -31289,13 +31289,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v17, (zero), v0
@@ -31307,10 +31307,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -31340,10 +31340,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v24
@@ -32026,12 +32026,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB880_2
@@ -32049,7 +32049,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB880_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32078,7 +32078,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32110,12 +32110,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB880_2
@@ -32133,7 +32133,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB880_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32162,7 +32162,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32199,12 +32199,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB881_2
@@ -32222,7 +32222,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB881_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32251,7 +32251,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32283,12 +32283,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB881_2
@@ -32306,7 +32306,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB881_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32335,7 +32335,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32372,12 +32372,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB882_2
@@ -32395,7 +32395,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB882_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32424,7 +32424,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32456,12 +32456,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB882_2
@@ -32479,7 +32479,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB882_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32508,7 +32508,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32545,12 +32545,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB883_2
@@ -32568,7 +32568,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB883_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32597,7 +32597,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32629,12 +32629,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB883_2
@@ -32652,7 +32652,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB883_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32681,7 +32681,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32717,12 +32717,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB884_2
@@ -32740,7 +32740,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB884_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32769,7 +32769,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32801,12 +32801,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB884_2
@@ -32824,7 +32824,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB884_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32853,7 +32853,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32907,13 +32907,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v0
@@ -32925,10 +32925,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32955,10 +32955,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
@@ -32997,13 +32997,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v0
@@ -33015,10 +33015,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33045,10 +33045,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
@@ -33092,13 +33092,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v0
@@ -33110,10 +33110,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33140,10 +33140,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
@@ -33182,13 +33182,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v0
@@ -33200,10 +33200,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33230,10 +33230,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
@@ -33277,13 +33277,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v0
@@ -33295,10 +33295,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33325,10 +33325,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
@@ -33367,13 +33367,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v0
@@ -33385,10 +33385,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33415,10 +33415,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
@@ -33462,13 +33462,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v0
@@ -33480,10 +33480,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33510,10 +33510,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
@@ -33552,13 +33552,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v0
@@ -33570,10 +33570,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33600,10 +33600,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
@@ -33646,13 +33646,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v0
@@ -33664,10 +33664,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33694,10 +33694,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
@@ -33736,13 +33736,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v0
@@ -33754,10 +33754,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33784,10 +33784,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
@@ -34527,30 +34527,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64V-NEXT: mv a0, s1
; CHECK-RV64V-NEXT: call __muldi3
; CHECK-RV64V-NEXT: slli a2, s1, 2
-; CHECK-RV64V-NEXT: sub a1, s0, a2
-; CHECK-RV64V-NEXT: sltu a3, s0, a1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
-; CHECK-RV64V-NEXT: and a3, a3, a1
; CHECK-RV64V-NEXT: slli a1, s1, 1
+; CHECK-RV64V-NEXT: sub a3, s0, a2
+; CHECK-RV64V-NEXT: sltu a4, a2, s0
+; CHECK-RV64V-NEXT: addi a4, a4, -1
+; CHECK-RV64V-NEXT: and a3, a4, a3
; CHECK-RV64V-NEXT: sub a4, a3, a1
-; CHECK-RV64V-NEXT: sltu a5, a3, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a3
; CHECK-RV64V-NEXT: addi a5, a5, -1
-; CHECK-RV64V-NEXT: and a6, a5, a4
-; CHECK-RV64V-NEXT: sub a4, a6, s1
-; CHECK-RV64V-NEXT: mv a5, a6
-; CHECK-RV64V-NEXT: bltu a6, s1, .LBB910_2
+; CHECK-RV64V-NEXT: and a5, a5, a4
+; CHECK-RV64V-NEXT: mv a4, a5
+; CHECK-RV64V-NEXT: bltu a5, s1, .LBB910_2
; CHECK-RV64V-NEXT: # %bb.1:
-; CHECK-RV64V-NEXT: mv a5, s1
+; CHECK-RV64V-NEXT: mv a4, s1
; CHECK-RV64V-NEXT: .LBB910_2:
-; CHECK-RV64V-NEXT: sltu a7, a6, a4
+; CHECK-RV64V-NEXT: sltu a7, s1, a5
; CHECK-RV64V-NEXT: bltu a3, a1, .LBB910_4
; CHECK-RV64V-NEXT: # %bb.3:
; CHECK-RV64V-NEXT: mv a3, a1
; CHECK-RV64V-NEXT: .LBB910_4:
; CHECK-RV64V-NEXT: add a6, s2, a0
-; CHECK-RV64V-NEXT: addi a0, a7, -1
+; CHECK-RV64V-NEXT: sub a0, a5, s1
+; CHECK-RV64V-NEXT: addi a5, a7, -1
; CHECK-RV64V-NEXT: sub a7, a3, s1
-; CHECK-RV64V-NEXT: sltu t0, a3, a7
+; CHECK-RV64V-NEXT: sltu t0, s1, a3
; CHECK-RV64V-NEXT: addi t0, t0, -1
; CHECK-RV64V-NEXT: and a7, t0, a7
; CHECK-RV64V-NEXT: bltu a3, s1, .LBB910_6
@@ -34560,26 +34560,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64V-NEXT: vl8re64.v v16, (a6)
; CHECK-RV64V-NEXT: addi a6, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 3
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 3
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a7, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 4
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 4
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: and a0, a0, a4
+; CHECK-RV64V-NEXT: and a0, a5, a0
; CHECK-RV64V-NEXT: bltu s0, a2, .LBB910_8
; CHECK-RV64V-NEXT: # %bb.7:
; CHECK-RV64V-NEXT: mv s0, a2
@@ -34589,11 +34589,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64V-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64V-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64V-NEXT: sub a0, s0, a1
-; CHECK-RV64V-NEXT: sltu a2, s0, a0
+; CHECK-RV64V-NEXT: sltu a2, a1, s0
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a0, a2, a0
; CHECK-RV64V-NEXT: sub a2, a0, s1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, s1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: bltu a0, s1, .LBB910_10
@@ -34619,7 +34619,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64V-NEXT: mv s0, a1
; CHECK-RV64V-NEXT: .LBB910_12:
; CHECK-RV64V-NEXT: sub a0, s0, s1
-; CHECK-RV64V-NEXT: sltu a1, s0, a0
+; CHECK-RV64V-NEXT: sltu a1, s1, s0
; CHECK-RV64V-NEXT: addi a1, a1, -1
; CHECK-RV64V-NEXT: and a0, a1, a0
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -34674,45 +34674,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV32V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT: csrr a4, vlenb
+; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT: slli a3, a4, 3
-; CHECK-RV32V-NEXT: slli a1, a4, 2
-; CHECK-RV32V-NEXT: add a0, a0, a3
-; CHECK-RV32V-NEXT: sub a3, a2, a1
+; CHECK-RV32V-NEXT: slli a4, a1, 3
+; CHECK-RV32V-NEXT: slli a3, a1, 2
+; CHECK-RV32V-NEXT: slli a1, a1, 1
+; CHECK-RV32V-NEXT: add a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a2, a3
+; CHECK-RV32V-NEXT: sltu a5, a3, a2
; CHECK-RV32V-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT: sltu a0, a2, a3
-; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a3, a0, a3
-; CHECK-RV32V-NEXT: slli a0, a4, 1
-; CHECK-RV32V-NEXT: sub a4, a3, a0
-; CHECK-RV32V-NEXT: sltu a5, a3, a4
+; CHECK-RV32V-NEXT: addi a0, a5, -1
+; CHECK-RV32V-NEXT: and a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a0, a1
+; CHECK-RV32V-NEXT: sltu a5, a1, a0
; CHECK-RV32V-NEXT: addi a5, a5, -1
; CHECK-RV32V-NEXT: and a4, a5, a4
-; CHECK-RV32V-NEXT: bltu a3, a0, .LBB910_2
+; CHECK-RV32V-NEXT: bltu a0, a1, .LBB910_2
; CHECK-RV32V-NEXT: # %bb.1:
-; CHECK-RV32V-NEXT: mv a3, a0
+; CHECK-RV32V-NEXT: mv a0, a1
; CHECK-RV32V-NEXT: .LBB910_2:
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT: bltu a2, a1, .LBB910_4
+; CHECK-RV32V-NEXT: bltu a2, a3, .LBB910_4
; CHECK-RV32V-NEXT: # %bb.3:
-; CHECK-RV32V-NEXT: mv a2, a1
+; CHECK-RV32V-NEXT: mv a2, a3
; CHECK-RV32V-NEXT: .LBB910_4:
-; CHECK-RV32V-NEXT: sub a1, a2, a0
-; CHECK-RV32V-NEXT: sltu a3, a2, a1
+; CHECK-RV32V-NEXT: sub a0, a2, a1
+; CHECK-RV32V-NEXT: sltu a3, a1, a2
; CHECK-RV32V-NEXT: addi a3, a3, -1
-; CHECK-RV32V-NEXT: and a1, a3, a1
-; CHECK-RV32V-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: and a0, a3, a0
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT: bltu a2, a0, .LBB910_6
+; CHECK-RV32V-NEXT: bltu a2, a1, .LBB910_6
; CHECK-RV32V-NEXT: # %bb.5:
-; CHECK-RV32V-NEXT: mv a2, a0
+; CHECK-RV32V-NEXT: mv a2, a1
; CHECK-RV32V-NEXT: .LBB910_6:
; CHECK-RV32V-NEXT: addi a0, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -34800,33 +34800,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64VC-NEXT: li a1, 40
; CHECK-RV64VC-NEXT: mv a0, s1
; CHECK-RV64VC-NEXT: call __muldi3
-; CHECK-RV64VC-NEXT: slli a7, s1, 2
-; CHECK-RV64VC-NEXT: sub a1, s0, a7
-; CHECK-RV64VC-NEXT: sltu a2, s0, a1
-; CHECK-RV64VC-NEXT: addi a2, a2, -1
-; CHECK-RV64VC-NEXT: and a3, a2, a1
+; CHECK-RV64VC-NEXT: slli a2, s1, 2
; CHECK-RV64VC-NEXT: slli a1, s1, 1
-; CHECK-RV64VC-NEXT: sub a2, a3, a1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a3, s0, a2
+; CHECK-RV64VC-NEXT: sltu a4, a2, s0
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
-; CHECK-RV64VC-NEXT: sub t0, a2, s1
-; CHECK-RV64VC-NEXT: mv a5, a2
-; CHECK-RV64VC-NEXT: bltu a2, s1, .LBB910_2
+; CHECK-RV64VC-NEXT: and a3, a3, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a5, a1, a3
+; CHECK-RV64VC-NEXT: addi a5, a5, -1
+; CHECK-RV64VC-NEXT: and a5, a5, a4
+; CHECK-RV64VC-NEXT: mv t0, a5
+; CHECK-RV64VC-NEXT: bltu a5, s1, .LBB910_2
; CHECK-RV64VC-NEXT: # %bb.1:
-; CHECK-RV64VC-NEXT: mv a5, s1
+; CHECK-RV64VC-NEXT: mv t0, s1
; CHECK-RV64VC-NEXT: .LBB910_2:
-; CHECK-RV64VC-NEXT: sltu a6, a2, t0
+; CHECK-RV64VC-NEXT: sltu a7, s1, a5
; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB910_4
; CHECK-RV64VC-NEXT: # %bb.3:
; CHECK-RV64VC-NEXT: mv a3, a1
; CHECK-RV64VC-NEXT: .LBB910_4:
; CHECK-RV64VC-NEXT: add a0, a0, s2
-; CHECK-RV64VC-NEXT: addi a6, a6, -1
-; CHECK-RV64VC-NEXT: sub a2, a3, s1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a6, a5, s1
+; CHECK-RV64VC-NEXT: addi a7, a7, -1
+; CHECK-RV64VC-NEXT: sub a5, a3, s1
+; CHECK-RV64VC-NEXT: sltu a4, s1, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
+; CHECK-RV64VC-NEXT: and a5, a5, a4
; CHECK-RV64VC-NEXT: bltu a3, s1, .LBB910_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a3, s1
@@ -34834,7 +34834,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a0)
; CHECK-RV64VC-NEXT: addi a0, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, t0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vluxei64.v v14, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -34842,7 +34842,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64VC-NEXT: add a0, a0, sp
; CHECK-RV64VC-NEXT: addi a0, a0, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vluxei64.v v13, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -34853,21 +34853,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: and a0, a6, t0
-; CHECK-RV64VC-NEXT: bltu s0, a7, .LBB910_8
+; CHECK-RV64VC-NEXT: and a0, a7, a6
+; CHECK-RV64VC-NEXT: bltu s0, a2, .LBB910_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv s0, a7
+; CHECK-RV64VC-NEXT: mv s0, a2
; CHECK-RV64VC-NEXT: .LBB910_8:
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64VC-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64VC-NEXT: sub a0, s0, a1
-; CHECK-RV64VC-NEXT: sltu a2, s0, a0
+; CHECK-RV64VC-NEXT: sltu a2, a1, s0
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: sub a2, a0, s1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, s1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: bltu a0, s1, .LBB910_10
@@ -34893,7 +34893,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64VC-NEXT: mv s0, a1
; CHECK-RV64VC-NEXT: .LBB910_12:
; CHECK-RV64VC-NEXT: sub a0, s0, s1
-; CHECK-RV64VC-NEXT: sltu a1, s0, a0
+; CHECK-RV64VC-NEXT: sltu a1, s1, s0
; CHECK-RV64VC-NEXT: addi a1, a1, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -34948,45 +34948,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV32VC-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT: csrr a4, vlenb
+; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT: slli a3, a4, 3
-; CHECK-RV32VC-NEXT: slli a1, a4, 2
-; CHECK-RV32VC-NEXT: add a0, a0, a3
-; CHECK-RV32VC-NEXT: sub a3, a2, a1
+; CHECK-RV32VC-NEXT: slli a4, a1, 3
+; CHECK-RV32VC-NEXT: slli a3, a1, 2
+; CHECK-RV32VC-NEXT: slli a1, a1, 1
+; CHECK-RV32VC-NEXT: add a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a2, a3
+; CHECK-RV32VC-NEXT: sltu a5, a3, a2
; CHECK-RV32VC-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT: sltu a0, a2, a3
-; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a3, a3, a0
-; CHECK-RV32VC-NEXT: slli a0, a4, 1
-; CHECK-RV32VC-NEXT: sub a4, a3, a0
-; CHECK-RV32VC-NEXT: sltu a5, a3, a4
+; CHECK-RV32VC-NEXT: addi a0, a5, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a0, a1
+; CHECK-RV32VC-NEXT: sltu a5, a1, a0
; CHECK-RV32VC-NEXT: addi a5, a5, -1
; CHECK-RV32VC-NEXT: and a4, a4, a5
-; CHECK-RV32VC-NEXT: bltu a3, a0, .LBB910_2
+; CHECK-RV32VC-NEXT: bltu a0, a1, .LBB910_2
; CHECK-RV32VC-NEXT: # %bb.1:
-; CHECK-RV32VC-NEXT: mv a3, a0
+; CHECK-RV32VC-NEXT: mv a0, a1
; CHECK-RV32VC-NEXT: .LBB910_2:
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB910_4
+; CHECK-RV32VC-NEXT: bltu a2, a3, .LBB910_4
; CHECK-RV32VC-NEXT: # %bb.3:
-; CHECK-RV32VC-NEXT: mv a2, a1
+; CHECK-RV32VC-NEXT: mv a2, a3
; CHECK-RV32VC-NEXT: .LBB910_4:
-; CHECK-RV32VC-NEXT: sub a1, a2, a0
-; CHECK-RV32VC-NEXT: sltu a3, a2, a1
+; CHECK-RV32VC-NEXT: sub a0, a2, a1
+; CHECK-RV32VC-NEXT: sltu a3, a1, a2
; CHECK-RV32VC-NEXT: addi a3, a3, -1
-; CHECK-RV32VC-NEXT: and a1, a1, a3
-; CHECK-RV32VC-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: and a0, a0, a3
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT: bltu a2, a0, .LBB910_6
+; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB910_6
; CHECK-RV32VC-NEXT: # %bb.5:
-; CHECK-RV32VC-NEXT: mv a2, a0
+; CHECK-RV32VC-NEXT: mv a2, a1
; CHECK-RV32VC-NEXT: .LBB910_6:
; CHECK-RV32VC-NEXT: addi a0, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35080,30 +35080,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64V-NEXT: mv a0, s1
; CHECK-RV64V-NEXT: call __muldi3
; CHECK-RV64V-NEXT: slli a2, s1, 2
-; CHECK-RV64V-NEXT: sub a1, s0, a2
-; CHECK-RV64V-NEXT: sltu a3, s0, a1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
-; CHECK-RV64V-NEXT: and a3, a3, a1
; CHECK-RV64V-NEXT: slli a1, s1, 1
+; CHECK-RV64V-NEXT: sub a3, s0, a2
+; CHECK-RV64V-NEXT: sltu a4, a2, s0
+; CHECK-RV64V-NEXT: addi a4, a4, -1
+; CHECK-RV64V-NEXT: and a3, a4, a3
; CHECK-RV64V-NEXT: sub a4, a3, a1
-; CHECK-RV64V-NEXT: sltu a5, a3, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a3
; CHECK-RV64V-NEXT: addi a5, a5, -1
-; CHECK-RV64V-NEXT: and a6, a5, a4
-; CHECK-RV64V-NEXT: sub a4, a6, s1
-; CHECK-RV64V-NEXT: mv a5, a6
-; CHECK-RV64V-NEXT: bltu a6, s1, .LBB911_2
+; CHECK-RV64V-NEXT: and a5, a5, a4
+; CHECK-RV64V-NEXT: mv a4, a5
+; CHECK-RV64V-NEXT: bltu a5, s1, .LBB911_2
; CHECK-RV64V-NEXT: # %bb.1:
-; CHECK-RV64V-NEXT: mv a5, s1
+; CHECK-RV64V-NEXT: mv a4, s1
; CHECK-RV64V-NEXT: .LBB911_2:
-; CHECK-RV64V-NEXT: sltu a7, a6, a4
+; CHECK-RV64V-NEXT: sltu a7, s1, a5
; CHECK-RV64V-NEXT: bltu a3, a1, .LBB911_4
; CHECK-RV64V-NEXT: # %bb.3:
; CHECK-RV64V-NEXT: mv a3, a1
; CHECK-RV64V-NEXT: .LBB911_4:
; CHECK-RV64V-NEXT: add a6, s2, a0
-; CHECK-RV64V-NEXT: addi a0, a7, -1
+; CHECK-RV64V-NEXT: sub a0, a5, s1
+; CHECK-RV64V-NEXT: addi a5, a7, -1
; CHECK-RV64V-NEXT: sub a7, a3, s1
-; CHECK-RV64V-NEXT: sltu t0, a3, a7
+; CHECK-RV64V-NEXT: sltu t0, s1, a3
; CHECK-RV64V-NEXT: addi t0, t0, -1
; CHECK-RV64V-NEXT: and a7, t0, a7
; CHECK-RV64V-NEXT: bltu a3, s1, .LBB911_6
@@ -35113,26 +35113,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64V-NEXT: vl8re64.v v16, (a6)
; CHECK-RV64V-NEXT: addi a6, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 3
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 3
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a7, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 4
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 4
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: and a0, a0, a4
+; CHECK-RV64V-NEXT: and a0, a5, a0
; CHECK-RV64V-NEXT: bltu s0, a2, .LBB911_8
; CHECK-RV64V-NEXT: # %bb.7:
; CHECK-RV64V-NEXT: mv s0, a2
@@ -35142,11 +35142,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64V-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64V-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64V-NEXT: sub a0, s0, a1
-; CHECK-RV64V-NEXT: sltu a2, s0, a0
+; CHECK-RV64V-NEXT: sltu a2, a1, s0
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a0, a2, a0
; CHECK-RV64V-NEXT: sub a2, a0, s1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, s1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: bltu a0, s1, .LBB911_10
@@ -35172,7 +35172,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64V-NEXT: mv s0, a1
; CHECK-RV64V-NEXT: .LBB911_12:
; CHECK-RV64V-NEXT: sub a0, s0, s1
-; CHECK-RV64V-NEXT: sltu a1, s0, a0
+; CHECK-RV64V-NEXT: sltu a1, s1, s0
; CHECK-RV64V-NEXT: addi a1, a1, -1
; CHECK-RV64V-NEXT: and a0, a1, a0
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -35227,45 +35227,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV32V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT: csrr a4, vlenb
+; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT: slli a3, a4, 3
-; CHECK-RV32V-NEXT: slli a1, a4, 2
-; CHECK-RV32V-NEXT: add a0, a0, a3
-; CHECK-RV32V-NEXT: sub a3, a2, a1
+; CHECK-RV32V-NEXT: slli a4, a1, 3
+; CHECK-RV32V-NEXT: slli a3, a1, 2
+; CHECK-RV32V-NEXT: slli a1, a1, 1
+; CHECK-RV32V-NEXT: add a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a2, a3
+; CHECK-RV32V-NEXT: sltu a5, a3, a2
; CHECK-RV32V-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT: sltu a0, a2, a3
-; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a3, a0, a3
-; CHECK-RV32V-NEXT: slli a0, a4, 1
-; CHECK-RV32V-NEXT: sub a4, a3, a0
-; CHECK-RV32V-NEXT: sltu a5, a3, a4
+; CHECK-RV32V-NEXT: addi a0, a5, -1
+; CHECK-RV32V-NEXT: and a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a0, a1
+; CHECK-RV32V-NEXT: sltu a5, a1, a0
; CHECK-RV32V-NEXT: addi a5, a5, -1
; CHECK-RV32V-NEXT: and a4, a5, a4
-; CHECK-RV32V-NEXT: bltu a3, a0, .LBB911_2
+; CHECK-RV32V-NEXT: bltu a0, a1, .LBB911_2
; CHECK-RV32V-NEXT: # %bb.1:
-; CHECK-RV32V-NEXT: mv a3, a0
+; CHECK-RV32V-NEXT: mv a0, a1
; CHECK-RV32V-NEXT: .LBB911_2:
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT: bltu a2, a1, .LBB911_4
+; CHECK-RV32V-NEXT: bltu a2, a3, .LBB911_4
; CHECK-RV32V-NEXT: # %bb.3:
-; CHECK-RV32V-NEXT: mv a2, a1
+; CHECK-RV32V-NEXT: mv a2, a3
; CHECK-RV32V-NEXT: .LBB911_4:
-; CHECK-RV32V-NEXT: sub a1, a2, a0
-; CHECK-RV32V-NEXT: sltu a3, a2, a1
+; CHECK-RV32V-NEXT: sub a0, a2, a1
+; CHECK-RV32V-NEXT: sltu a3, a1, a2
; CHECK-RV32V-NEXT: addi a3, a3, -1
-; CHECK-RV32V-NEXT: and a1, a3, a1
-; CHECK-RV32V-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: and a0, a3, a0
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT: bltu a2, a0, .LBB911_6
+; CHECK-RV32V-NEXT: bltu a2, a1, .LBB911_6
; CHECK-RV32V-NEXT: # %bb.5:
-; CHECK-RV32V-NEXT: mv a2, a0
+; CHECK-RV32V-NEXT: mv a2, a1
; CHECK-RV32V-NEXT: .LBB911_6:
; CHECK-RV32V-NEXT: addi a0, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35353,33 +35353,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64VC-NEXT: li a1, 40
; CHECK-RV64VC-NEXT: mv a0, s1
; CHECK-RV64VC-NEXT: call __muldi3
-; CHECK-RV64VC-NEXT: slli a7, s1, 2
-; CHECK-RV64VC-NEXT: sub a1, s0, a7
-; CHECK-RV64VC-NEXT: sltu a2, s0, a1
-; CHECK-RV64VC-NEXT: addi a2, a2, -1
-; CHECK-RV64VC-NEXT: and a3, a2, a1
+; CHECK-RV64VC-NEXT: slli a2, s1, 2
; CHECK-RV64VC-NEXT: slli a1, s1, 1
-; CHECK-RV64VC-NEXT: sub a2, a3, a1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a3, s0, a2
+; CHECK-RV64VC-NEXT: sltu a4, a2, s0
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
-; CHECK-RV64VC-NEXT: sub t0, a2, s1
-; CHECK-RV64VC-NEXT: mv a5, a2
-; CHECK-RV64VC-NEXT: bltu a2, s1, .LBB911_2
+; CHECK-RV64VC-NEXT: and a3, a3, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a5, a1, a3
+; CHECK-RV64VC-NEXT: addi a5, a5, -1
+; CHECK-RV64VC-NEXT: and a5, a5, a4
+; CHECK-RV64VC-NEXT: mv t0, a5
+; CHECK-RV64VC-NEXT: bltu a5, s1, .LBB911_2
; CHECK-RV64VC-NEXT: # %bb.1:
-; CHECK-RV64VC-NEXT: mv a5, s1
+; CHECK-RV64VC-NEXT: mv t0, s1
; CHECK-RV64VC-NEXT: .LBB911_2:
-; CHECK-RV64VC-NEXT: sltu a6, a2, t0
+; CHECK-RV64VC-NEXT: sltu a7, s1, a5
; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB911_4
; CHECK-RV64VC-NEXT: # %bb.3:
; CHECK-RV64VC-NEXT: mv a3, a1
; CHECK-RV64VC-NEXT: .LBB911_4:
; CHECK-RV64VC-NEXT: add a0, a0, s2
-; CHECK-RV64VC-NEXT: addi a6, a6, -1
-; CHECK-RV64VC-NEXT: sub a2, a3, s1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a6, a5, s1
+; CHECK-RV64VC-NEXT: addi a7, a7, -1
+; CHECK-RV64VC-NEXT: sub a5, a3, s1
+; CHECK-RV64VC-NEXT: sltu a4, s1, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
+; CHECK-RV64VC-NEXT: and a5, a5, a4
; CHECK-RV64VC-NEXT: bltu a3, s1, .LBB911_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a3, s1
@@ -35387,7 +35387,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a0)
; CHECK-RV64VC-NEXT: addi a0, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, t0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vluxei64.v v14, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -35395,7 +35395,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64VC-NEXT: add a0, a0, sp
; CHECK-RV64VC-NEXT: addi a0, a0, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vluxei64.v v13, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -35406,21 +35406,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: and a0, a6, t0
-; CHECK-RV64VC-NEXT: bltu s0, a7, .LBB911_8
+; CHECK-RV64VC-NEXT: and a0, a7, a6
+; CHECK-RV64VC-NEXT: bltu s0, a2, .LBB911_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv s0, a7
+; CHECK-RV64VC-NEXT: mv s0, a2
; CHECK-RV64VC-NEXT: .LBB911_8:
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64VC-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64VC-NEXT: sub a0, s0, a1
-; CHECK-RV64VC-NEXT: sltu a2, s0, a0
+; CHECK-RV64VC-NEXT: sltu a2, a1, s0
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: sub a2, a0, s1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, s1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: bltu a0, s1, .LBB911_10
@@ -35446,7 +35446,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64VC-NEXT: mv s0, a1
; CHECK-RV64VC-NEXT: .LBB911_12:
; CHECK-RV64VC-NEXT: sub a0, s0, s1
-; CHECK-RV64VC-NEXT: sltu a1, s0, a0
+; CHECK-RV64VC-NEXT: sltu a1, s1, s0
; CHECK-RV64VC-NEXT: addi a1, a1, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -35501,45 +35501,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV32VC-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT: csrr a4, vlenb
+; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT: slli a3, a4, 3
-; CHECK-RV32VC-NEXT: slli a1, a4, 2
-; CHECK-RV32VC-NEXT: add a0, a0, a3
-; CHECK-RV32VC-NEXT: sub a3, a2, a1
+; CHECK-RV32VC-NEXT: slli a4, a1, 3
+; CHECK-RV32VC-NEXT: slli a3, a1, 2
+; CHECK-RV32VC-NEXT: slli a1, a1, 1
+; CHECK-RV32VC-NEXT: add a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a2, a3
+; CHECK-RV32VC-NEXT: sltu a5, a3, a2
; CHECK-RV32VC-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT: sltu a0, a2, a3
-; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a3, a3, a0
-; CHECK-RV32VC-NEXT: slli a0, a4, 1
-; CHECK-RV32VC-NEXT: sub a4, a3, a0
-; CHECK-RV32VC-NEXT: sltu a5, a3, a4
+; CHECK-RV32VC-NEXT: addi a0, a5, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a0, a1
+; CHECK-RV32VC-NEXT: sltu a5, a1, a0
; CHECK-RV32VC-NEXT: addi a5, a5, -1
; CHECK-RV32VC-NEXT: and a4, a4, a5
-; CHECK-RV32VC-NEXT: bltu a3, a0, .LBB911_2
+; CHECK-RV32VC-NEXT: bltu a0, a1, .LBB911_2
; CHECK-RV32VC-NEXT: # %bb.1:
-; CHECK-RV32VC-NEXT: mv a3, a0
+; CHECK-RV32VC-NEXT: mv a0, a1
; CHECK-RV32VC-NEXT: .LBB911_2:
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB911_4
+; CHECK-RV32VC-NEXT: bltu a2, a3, .LBB911_4
; CHECK-RV32VC-NEXT: # %bb.3:
-; CHECK-RV32VC-NEXT: mv a2, a1
+; CHECK-RV32VC-NEXT: mv a2, a3
; CHECK-RV32VC-NEXT: .LBB911_4:
-; CHECK-RV32VC-NEXT: sub a1, a2, a0
-; CHECK-RV32VC-NEXT: sltu a3, a2, a1
+; CHECK-RV32VC-NEXT: sub a0, a2, a1
+; CHECK-RV32VC-NEXT: sltu a3, a1, a2
; CHECK-RV32VC-NEXT: addi a3, a3, -1
-; CHECK-RV32VC-NEXT: and a1, a1, a3
-; CHECK-RV32VC-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: and a0, a0, a3
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT: bltu a2, a0, .LBB911_6
+; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB911_6
; CHECK-RV32VC-NEXT: # %bb.5:
-; CHECK-RV32VC-NEXT: mv a2, a0
+; CHECK-RV32VC-NEXT: mv a2, a1
; CHECK-RV32VC-NEXT: .LBB911_6:
; CHECK-RV32VC-NEXT: addi a0, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35633,30 +35633,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64V-NEXT: mv a0, s1
; CHECK-RV64V-NEXT: call __muldi3
; CHECK-RV64V-NEXT: slli a2, s1, 2
-; CHECK-RV64V-NEXT: sub a1, s0, a2
-; CHECK-RV64V-NEXT: sltu a3, s0, a1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
-; CHECK-RV64V-NEXT: and a3, a3, a1
; CHECK-RV64V-NEXT: slli a1, s1, 1
+; CHECK-RV64V-NEXT: sub a3, s0, a2
+; CHECK-RV64V-NEXT: sltu a4, a2, s0
+; CHECK-RV64V-NEXT: addi a4, a4, -1
+; CHECK-RV64V-NEXT: and a3, a4, a3
; CHECK-RV64V-NEXT: sub a4, a3, a1
-; CHECK-RV64V-NEXT: sltu a5, a3, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a3
; CHECK-RV64V-NEXT: addi a5, a5, -1
-; CHECK-RV64V-NEXT: and a6, a5, a4
-; CHECK-RV64V-NEXT: sub a4, a6, s1
-; CHECK-RV64V-NEXT: mv a5, a6
-; CHECK-RV64V-NEXT: bltu a6, s1, .LBB912_2
+; CHECK-RV64V-NEXT: and a5, a5, a4
+; CHECK-RV64V-NEXT: mv a4, a5
+; CHECK-RV64V-NEXT: bltu a5, s1, .LBB912_2
; CHECK-RV64V-NEXT: # %bb.1:
-; CHECK-RV64V-NEXT: mv a5, s1
+; CHECK-RV64V-NEXT: mv a4, s1
; CHECK-RV64V-NEXT: .LBB912_2:
-; CHECK-RV64V-NEXT: sltu a7, a6, a4
+; CHECK-RV64V-NEXT: sltu a7, s1, a5
; CHECK-RV64V-NEXT: bltu a3, a1, .LBB912_4
; CHECK-RV64V-NEXT: # %bb.3:
; CHECK-RV64V-NEXT: mv a3, a1
; CHECK-RV64V-NEXT: .LBB912_4:
; CHECK-RV64V-NEXT: add a6, s2, a0
-; CHECK-RV64V-NEXT: addi a0, a7, -1
+; CHECK-RV64V-NEXT: sub a0, a5, s1
+; CHECK-RV64V-NEXT: addi a5, a7, -1
; CHECK-RV64V-NEXT: sub a7, a3, s1
-; CHECK-RV64V-NEXT: sltu t0, a3, a7
+; CHECK-RV64V-NEXT: sltu t0, s1, a3
; CHECK-RV64V-NEXT: addi t0, t0, -1
; CHECK-RV64V-NEXT: and a7, t0, a7
; CHECK-RV64V-NEXT: bltu a3, s1, .LBB912_6
@@ -35666,26 +35666,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64V-NEXT: vl8re64.v v16, (a6)
; CHECK-RV64V-NEXT: addi a6, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 3
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 3
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a7, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 4
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 4
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: and a0, a0, a4
+; CHECK-RV64V-NEXT: and a0, a5, a0
; CHECK-RV64V-NEXT: bltu s0, a2, .LBB912_8
; CHECK-RV64V-NEXT: # %bb.7:
; CHECK-RV64V-NEXT: mv s0, a2
@@ -35695,11 +35695,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64V-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64V-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64V-NEXT: sub a0, s0, a1
-; CHECK-RV64V-NEXT: sltu a2, s0, a0
+; CHECK-RV64V-NEXT: sltu a2, a1, s0
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a0, a2, a0
; CHECK-RV64V-NEXT: sub a2, a0, s1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, s1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: bltu a0, s1, .LBB912_10
@@ -35725,7 +35725,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64V-NEXT: mv s0, a1
; CHECK-RV64V-NEXT: .LBB912_12:
; CHECK-RV64V-NEXT: sub a0, s0, s1
-; CHECK-RV64V-NEXT: sltu a1, s0, a0
+; CHECK-RV64V-NEXT: sltu a1, s1, s0
; CHECK-RV64V-NEXT: addi a1, a1, -1
; CHECK-RV64V-NEXT: and a0, a1, a0
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -35780,45 +35780,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV32V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT: csrr a4, vlenb
+; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT: slli a3, a4, 3
-; CHECK-RV32V-NEXT: slli a1, a4, 2
-; CHECK-RV32V-NEXT: add a0, a0, a3
-; CHECK-RV32V-NEXT: sub a3, a2, a1
+; CHECK-RV32V-NEXT: slli a4, a1, 3
+; CHECK-RV32V-NEXT: slli a3, a1, 2
+; CHECK-RV32V-NEXT: slli a1, a1, 1
+; CHECK-RV32V-NEXT: add a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a2, a3
+; CHECK-RV32V-NEXT: sltu a5, a3, a2
; CHECK-RV32V-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT: sltu a0, a2, a3
-; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a3, a0, a3
-; CHECK-RV32V-NEXT: slli a0, a4, 1
-; CHECK-RV32V-NEXT: sub a4, a3, a0
-; CHECK-RV32V-NEXT: sltu a5, a3, a4
+; CHECK-RV32V-NEXT: addi a0, a5, -1
+; CHECK-RV32V-NEXT: and a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a0, a1
+; CHECK-RV32V-NEXT: sltu a5, a1, a0
; CHECK-RV32V-NEXT: addi a5, a5, -1
; CHECK-RV32V-NEXT: and a4, a5, a4
-; CHECK-RV32V-NEXT: bltu a3, a0, .LBB912_2
+; CHECK-RV32V-NEXT: bltu a0, a1, .LBB912_2
; CHECK-RV32V-NEXT: # %bb.1:
-; CHECK-RV32V-NEXT: mv a3, a0
+; CHECK-RV32V-NEXT: mv a0, a1
; CHECK-RV32V-NEXT: .LBB912_2:
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT: bltu a2, a1, .LBB912_4
+; CHECK-RV32V-NEXT: bltu a2, a3, .LBB912_4
; CHECK-RV32V-NEXT: # %bb.3:
-; CHECK-RV32V-NEXT: mv a2, a1
+; CHECK-RV32V-NEXT: mv a2, a3
; CHECK-RV32V-NEXT: .LBB912_4:
-; CHECK-RV32V-NEXT: sub a1, a2, a0
-; CHECK-RV32V-NEXT: sltu a3, a2, a1
+; CHECK-RV32V-NEXT: sub a0, a2, a1
+; CHECK-RV32V-NEXT: sltu a3, a1, a2
; CHECK-RV32V-NEXT: addi a3, a3, -1
-; CHECK-RV32V-NEXT: and a1, a3, a1
-; CHECK-RV32V-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: and a0, a3, a0
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT: bltu a2, a0, .LBB912_6
+; CHECK-RV32V-NEXT: bltu a2, a1, .LBB912_6
; CHECK-RV32V-NEXT: # %bb.5:
-; CHECK-RV32V-NEXT: mv a2, a0
+; CHECK-RV32V-NEXT: mv a2, a1
; CHECK-RV32V-NEXT: .LBB912_6:
; CHECK-RV32V-NEXT: addi a0, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35906,33 +35906,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64VC-NEXT: li a1, 40
; CHECK-RV64VC-NEXT: mv a0, s1
; CHECK-RV64VC-NEXT: call __muldi3
-; CHECK-RV64VC-NEXT: slli a7, s1, 2
-; CHECK-RV64VC-NEXT: sub a1, s0, a7
-; CHECK-RV64VC-NEXT: sltu a2, s0, a1
-; CHECK-RV64VC-NEXT: addi a2, a2, -1
-; CHECK-RV64VC-NEXT: and a3, a2, a1
+; CHECK-RV64VC-NEXT: slli a2, s1, 2
; CHECK-RV64VC-NEXT: slli a1, s1, 1
-; CHECK-RV64VC-NEXT: sub a2, a3, a1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a3, s0, a2
+; CHECK-RV64VC-NEXT: sltu a4, a2, s0
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
-; CHECK-RV64VC-NEXT: sub t0, a2, s1
-; CHECK-RV64VC-NEXT: mv a5, a2
-; CHECK-RV64VC-NEXT: bltu a2, s1, .LBB912_2
+; CHECK-RV64VC-NEXT: and a3, a3, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a5, a1, a3
+; CHECK-RV64VC-NEXT: addi a5, a5, -1
+; CHECK-RV64VC-NEXT: and a5, a5, a4
+; CHECK-RV64VC-NEXT: mv t0, a5
+; CHECK-RV64VC-NEXT: bltu a5, s1, .LBB912_2
; CHECK-RV64VC-NEXT: # %bb.1:
-; CHECK-RV64VC-NEXT: mv a5, s1
+; CHECK-RV64VC-NEXT: mv t0, s1
; CHECK-RV64VC-NEXT: .LBB912_2:
-; CHECK-RV64VC-NEXT: sltu a6, a2, t0
+; CHECK-RV64VC-NEXT: sltu a7, s1, a5
; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB912_4
; CHECK-RV64VC-NEXT: # %bb.3:
; CHECK-RV64VC-NEXT: mv a3, a1
; CHECK-RV64VC-NEXT: .LBB912_4:
; CHECK-RV64VC-NEXT: add a0, a0, s2
-; CHECK-RV64VC-NEXT: addi a6, a6, -1
-; CHECK-RV64VC-NEXT: sub a2, a3, s1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a6, a5, s1
+; CHECK-RV64VC-NEXT: addi a7, a7, -1
+; CHECK-RV64VC-NEXT: sub a5, a3, s1
+; CHECK-RV64VC-NEXT: sltu a4, s1, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
+; CHECK-RV64VC-NEXT: and a5, a5, a4
; CHECK-RV64VC-NEXT: bltu a3, s1, .LBB912_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a3, s1
@@ -35940,7 +35940,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a0)
; CHECK-RV64VC-NEXT: addi a0, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, t0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vluxei64.v v14, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -35948,7 +35948,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64VC-NEXT: add a0, a0, sp
; CHECK-RV64VC-NEXT: addi a0, a0, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vluxei64.v v13, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -35959,21 +35959,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: and a0, a6, t0
-; CHECK-RV64VC-NEXT: bltu s0, a7, .LBB912_8
+; CHECK-RV64VC-NEXT: and a0, a7, a6
+; CHECK-RV64VC-NEXT: bltu s0, a2, .LBB912_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv s0, a7
+; CHECK-RV64VC-NEXT: mv s0, a2
; CHECK-RV64VC-NEXT: .LBB912_8:
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64VC-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64VC-NEXT: sub a0, s0, a1
-; CHECK-RV64VC-NEXT: sltu a2, s0, a0
+; CHECK-RV64VC-NEXT: sltu a2, a1, s0
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: sub a2, a0, s1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, s1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: bltu a0, s1, .LBB912_10
@@ -35999,7 +35999,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64VC-NEXT: mv s0, a1
; CHECK-RV64VC-NEXT: .LBB912_12:
; CHECK-RV64VC-NEXT: sub a0, s0, s1
-; CHECK-RV64VC-NEXT: sltu a1, s0, a0
+; CHECK-RV64VC-NEXT: sltu a1, s1, s0
; CHECK-RV64VC-NEXT: addi a1, a1, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -36054,45 +36054,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV32VC-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT: csrr a4, vlenb
+; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT: slli a3, a4, 3
-; CHECK-RV32VC-NEXT: slli a1, a4, 2
-; CHECK-RV32VC-NEXT: add a0, a0, a3
-; CHECK-RV32VC-NEXT: sub a3, a2, a1
+; CHECK-RV32VC-NEXT: slli a4, a1, 3
+; CHECK-RV32VC-NEXT: slli a3, a1, 2
+; CHECK-RV32VC-NEXT: slli a1, a1, 1
+; CHECK-RV32VC-NEXT: add a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a2, a3
+; CHECK-RV32VC-NEXT: sltu a5, a3, a2
; CHECK-RV32VC-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT: sltu a0, a2, a3
-; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a3, a3, a0
-; CHECK-RV32VC-NEXT: slli a0, a4, 1
-; CHECK-RV32VC-NEXT: sub a4, a3, a0
-; CHECK-RV32VC-NEXT: sltu a5, a3, a4
+; CHECK-RV32VC-NEXT: addi a0, a5, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a0, a1
+; CHECK-RV32VC-NEXT: sltu a5, a1, a0
; CHECK-RV32VC-NEXT: addi a5, a5, -1
; CHECK-RV32VC-NEXT: and a4, a4, a5
-; CHECK-RV32VC-NEXT: bltu a3, a0, .LBB912_2
+; CHECK-RV32VC-NEXT: bltu a0, a1, .LBB912_2
; CHECK-RV32VC-NEXT: # %bb.1:
-; CHECK-RV32VC-NEXT: mv a3, a0
+; CHECK-RV32VC-NEXT: mv a0, a1
; CHECK-RV32VC-NEXT: .LBB912_2:
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB912_4
+; CHECK-RV32VC-NEXT: bltu a2, a3, .LBB912_4
; CHECK-RV32VC-NEXT: # %bb.3:
-; CHECK-RV32VC-NEXT: mv a2, a1
+; CHECK-RV32VC-NEXT: mv a2, a3
; CHECK-RV32VC-NEXT: .LBB912_4:
-; CHECK-RV32VC-NEXT: sub a1, a2, a0
-; CHECK-RV32VC-NEXT: sltu a3, a2, a1
+; CHECK-RV32VC-NEXT: sub a0, a2, a1
+; CHECK-RV32VC-NEXT: sltu a3, a1, a2
; CHECK-RV32VC-NEXT: addi a3, a3, -1
-; CHECK-RV32VC-NEXT: and a1, a1, a3
-; CHECK-RV32VC-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: and a0, a0, a3
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT: bltu a2, a0, .LBB912_6
+; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB912_6
; CHECK-RV32VC-NEXT: # %bb.5:
-; CHECK-RV32VC-NEXT: mv a2, a0
+; CHECK-RV32VC-NEXT: mv a2, a1
; CHECK-RV32VC-NEXT: .LBB912_6:
; CHECK-RV32VC-NEXT: addi a0, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -36186,30 +36186,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64V-NEXT: mv a0, s1
; CHECK-RV64V-NEXT: call __muldi3
; CHECK-RV64V-NEXT: slli a2, s1, 2
-; CHECK-RV64V-NEXT: sub a1, s0, a2
-; CHECK-RV64V-NEXT: sltu a3, s0, a1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
-; CHECK-RV64V-NEXT: and a3, a3, a1
; CHECK-RV64V-NEXT: slli a1, s1, 1
+; CHECK-RV64V-NEXT: sub a3, s0, a2
+; CHECK-RV64V-NEXT: sltu a4, a2, s0
+; CHECK-RV64V-NEXT: addi a4, a4, -1
+; CHECK-RV64V-NEXT: and a3, a4, a3
; CHECK-RV64V-NEXT: sub a4, a3, a1
-; CHECK-RV64V-NEXT: sltu a5, a3, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a3
; CHECK-RV64V-NEXT: addi a5, a5, -1
-; CHECK-RV64V-NEXT: and a6, a5, a4
-; CHECK-RV64V-NEXT: sub a4, a6, s1
-; CHECK-RV64V-NEXT: mv a5, a6
-; CHECK-RV64V-NEXT: bltu a6, s1, .LBB913_2
+; CHECK-RV64V-NEXT: and a5, a5, a4
+; CHECK-RV64V-NEXT: mv a4, a5
+; CHECK-RV64V-NEXT: bltu a5, s1, .LBB913_2
; CHECK-RV64V-NEXT: # %bb.1:
-; CHECK-RV64V-NEXT: mv a5, s1
+; CHECK-RV64V-NEXT: mv a4, s1
; CHECK-RV64V-NEXT: .LBB913_2:
-; CHECK-RV64V-NEXT: sltu a7, a6, a4
+; CHECK-RV64V-NEXT: sltu a7, s1, a5
; CHECK-RV64V-NEXT: bltu a3, a1, .LBB913_4
; CHECK-RV64V-NEXT: # %bb.3:
; CHECK-RV64V-NEXT: mv a3, a1
; CHECK-RV64V-NEXT: .LBB913_4:
; CHECK-RV64V-NEXT: add a6, s2, a0
-; CHECK-RV64V-NEXT: addi a0, a7, -1
+; CHECK-RV64V-NEXT: sub a0, a5, s1
+; CHECK-RV64V-NEXT: addi a5, a7, -1
; CHECK-RV64V-NEXT: sub a7, a3, s1
-; CHECK-RV64V-NEXT: sltu t0, a3, a7
+; CHECK-RV64V-NEXT: sltu t0, s1, a3
; CHECK-RV64V-NEXT: addi t0, t0, -1
; CHECK-RV64V-NEXT: and a7, t0, a7
; CHECK-RV64V-NEXT: bltu a3, s1, .LBB913_6
@@ -36219,26 +36219,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64V-NEXT: vl8re64.v v16, (a6)
; CHECK-RV64V-NEXT: addi a6, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 3
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 3
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a7, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 4
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 4
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: and a0, a0, a4
+; CHECK-RV64V-NEXT: and a0, a5, a0
; CHECK-RV64V-NEXT: bltu s0, a2, .LBB913_8
; CHECK-RV64V-NEXT: # %bb.7:
; CHECK-RV64V-NEXT: mv s0, a2
@@ -36248,11 +36248,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64V-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64V-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64V-NEXT: sub a0, s0, a1
-; CHECK-RV64V-NEXT: sltu a2, s0, a0
+; CHECK-RV64V-NEXT: sltu a2, a1, s0
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a0, a2, a0
; CHECK-RV64V-NEXT: sub a2, a0, s1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, s1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: bltu a0, s1, .LBB913_10
@@ -36278,7 +36278,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64V-NEXT: mv s0, a1
; CHECK-RV64V-NEXT: .LBB913_12:
; CHECK-RV64V-NEXT: sub a0, s0, s1
-; CHECK-RV64V-NEXT: sltu a1, s0, a0
+; CHECK-RV64V-NEXT: sltu a1, s1, s0
; CHECK-RV64V-NEXT: addi a1, a1, -1
; CHECK-RV64V-NEXT: and a0, a1, a0
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -36333,45 +36333,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV32V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT: csrr a4, vlenb
+; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT: slli a3, a4, 3
-; CHECK-RV32V-NEXT: slli a1, a4, 2
-; CHECK-RV32V-NEXT: add a0, a0, a3
-; CHECK-RV32V-NEXT: sub a3, a2, a1
+; CHECK-RV32V-NEXT: slli a4, a1, 3
+; CHECK-RV32V-NEXT: slli a3, a1, 2
+; CHECK-RV32V-NEXT: slli a1, a1, 1
+; CHECK-RV32V-NEXT: add a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a2, a3
+; CHECK-RV32V-NEXT: sltu a5, a3, a2
; CHECK-RV32V-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT: sltu a0, a2, a3
-; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a3, a0, a3
-; CHECK-RV32V-NEXT: slli a0, a4, 1
-; CHECK-RV32V-NEXT: sub a4, a3, a0
-; CHECK-RV32V-NEXT: sltu a5, a3, a4
+; CHECK-RV32V-NEXT: addi a0, a5, -1
+; CHECK-RV32V-NEXT: and a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a0, a1
+; CHECK-RV32V-NEXT: sltu a5, a1, a0
; CHECK-RV32V-NEXT: addi a5, a5, -1
; CHECK-RV32V-NEXT: and a4, a5, a4
-; CHECK-RV32V-NEXT: bltu a3, a0, .LBB913_2
+; CHECK-RV32V-NEXT: bltu a0, a1, .LBB913_2
; CHECK-RV32V-NEXT: # %bb.1:
-; CHECK-RV32V-NEXT: mv a3, a0
+; CHECK-RV32V-NEXT: mv a0, a1
; CHECK-RV32V-NEXT: .LBB913_2:
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT: bltu a2, a1, .LBB913_4
+; CHECK-RV32V-NEXT: bltu a2, a3, .LBB913_4
; CHECK-RV32V-NEXT: # %bb.3:
-; CHECK-RV32V-NEXT: mv a2, a1
+; CHECK-RV32V-NEXT: mv a2, a3
; CHECK-RV32V-NEXT: .LBB913_4:
-; CHECK-RV32V-NEXT: sub a1, a2, a0
-; CHECK-RV32V-NEXT: sltu a3, a2, a1
+; CHECK-RV32V-NEXT: sub a0, a2, a1
+; CHECK-RV32V-NEXT: sltu a3, a1, a2
; CHECK-RV32V-NEXT: addi a3, a3, -1
-; CHECK-RV32V-NEXT: and a1, a3, a1
-; CHECK-RV32V-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: and a0, a3, a0
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT: bltu a2, a0, .LBB913_6
+; CHECK-RV32V-NEXT: bltu a2, a1, .LBB913_6
; CHECK-RV32V-NEXT: # %bb.5:
-; CHECK-RV32V-NEXT: mv a2, a0
+; CHECK-RV32V-NEXT: mv a2, a1
; CHECK-RV32V-NEXT: .LBB913_6:
; CHECK-RV32V-NEXT: addi a0, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -36459,33 +36459,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64VC-NEXT: li a1, 40
; CHECK-RV64VC-NEXT: mv a0, s1
; CHECK-RV64VC-NEXT: call __muldi3
-; CHECK-RV64VC-NEXT: slli a7, s1, 2
-; CHECK-RV64VC-NEXT: sub a1, s0, a7
-; CHECK-RV64VC-NEXT: sltu a2, s0, a1
-; CHECK-RV64VC-NEXT: addi a2, a2, -1
-; CHECK-RV64VC-NEXT: and a3, a2, a1
+; CHECK-RV64VC-NEXT: slli a2, s1, 2
; CHECK-RV64VC-NEXT: slli a1, s1, 1
-; CHECK-RV64VC-NEXT: sub a2, a3, a1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a3, s0, a2
+; CHECK-RV64VC-NEXT: sltu a4, a2, s0
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
-; CHECK-RV64VC-NEXT: sub t0, a2, s1
-; CHECK-RV64VC-NEXT: mv a5, a2
-; CHECK-RV64VC-NEXT: bltu a2, s1, .LBB913_2
+; CHECK-RV64VC-NEXT: and a3, a3, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a5, a1, a3
+; CHECK-RV64VC-NEXT: addi a5, a5, -1
+; CHECK-RV64VC-NEXT: and a5, a5, a4
+; CHECK-RV64VC-NEXT: mv t0, a5
+; CHECK-RV64VC-NEXT: bltu a5, s1, .LBB913_2
; CHECK-RV64VC-NEXT: # %bb.1:
-; CHECK-RV64VC-NEXT: mv a5, s1
+; CHECK-RV64VC-NEXT: mv t0, s1
; CHECK-RV64VC-NEXT: .LBB913_2:
-; CHECK-RV64VC-NEXT: sltu a6, a2, t0
+; CHECK-RV64VC-NEXT: sltu a7, s1, a5
; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB913_4
; CHECK-RV64VC-NEXT: # %bb.3:
; CHECK-RV64VC-NEXT: mv a3, a1
; CHECK-RV64VC-NEXT: .LBB913_4:
; CHECK-RV64VC-NEXT: add a0, a0, s2
-; CHECK-RV64VC-NEXT: addi a6, a6, -1
-; CHECK-RV64VC-NEXT: sub a2, a3, s1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a6, a5, s1
+; CHECK-RV64VC-NEXT: addi a7, a7, -1
+; CHECK-RV64VC-NEXT: sub a5, a3, s1
+; CHECK-RV64VC-NEXT: sltu a4, s1, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
+; CHECK-RV64VC-NEXT: and a5, a5, a4
; CHECK-RV64VC-NEXT: bltu a3, s1, .LBB913_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a3, s1
@@ -36493,7 +36493,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a0)
; CHECK-RV64VC-NEXT: addi a0, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, t0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v14, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -36501,7 +36501,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64VC-NEXT: add a0, a0, sp
; CHECK-RV64VC-NEXT: addi a0, a0, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v13, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -36512,21 +36512,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: and a0, a6, t0
-; CHECK-RV64VC-NEXT: bltu s0, a7, .LBB913_8
+; CHECK-RV64VC-NEXT: and a0, a7, a6
+; CHECK-RV64VC-NEXT: bltu s0, a2, .LBB913_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv s0, a7
+; CHECK-RV64VC-NEXT: mv s0, a2
; CHECK-RV64VC-NEXT: .LBB913_8:
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64VC-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64VC-NEXT: sub a0, s0, a1
-; CHECK-RV64VC-NEXT: sltu a2, s0, a0
+; CHECK-RV64VC-NEXT: sltu a2, a1, s0
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: sub a2, a0, s1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, s1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: bltu a0, s1, .LBB913_10
@@ -36552,7 +36552,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64VC-NEXT: mv s0, a1
; CHECK-RV64VC-NEXT: .LBB913_12:
; CHECK-RV64VC-NEXT: sub a0, s0, s1
-; CHECK-RV64VC-NEXT: sltu a1, s0, a0
+; CHECK-RV64VC-NEXT: sltu a1, s1, s0
; CHECK-RV64VC-NEXT: addi a1, a1, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -36607,45 +36607,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV32VC-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT: csrr a4, vlenb
+; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT: slli a3, a4, 3
-; CHECK-RV32VC-NEXT: slli a1, a4, 2
-; CHECK-RV32VC-NEXT: add a0, a0, a3
-; CHECK-RV32VC-NEXT: sub a3, a2, a1
+; CHECK-RV32VC-NEXT: slli a4, a1, 3
+; CHECK-RV32VC-NEXT: slli a3, a1, 2
+; CHECK-RV32VC-NEXT: slli a1, a1, 1
+; CHECK-RV32VC-NEXT: add a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a2, a3
+; CHECK-RV32VC-NEXT: sltu a5, a3, a2
; CHECK-RV32VC-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT: sltu a0, a2, a3
-; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a3, a3, a0
-; CHECK-RV32VC-NEXT: slli a0, a4, 1
-; CHECK-RV32VC-NEXT: sub a4, a3, a0
-; CHECK-RV32VC-NEXT: sltu a5, a3, a4
+; CHECK-RV32VC-NEXT: addi a0, a5, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a0, a1
+; CHECK-RV32VC-NEXT: sltu a5, a1, a0
; CHECK-RV32VC-NEXT: addi a5, a5, -1
; CHECK-RV32VC-NEXT: and a4, a4, a5
-; CHECK-RV32VC-NEXT: bltu a3, a0, .LBB913_2
+; CHECK-RV32VC-NEXT: bltu a0, a1, .LBB913_2
; CHECK-RV32VC-NEXT: # %bb.1:
-; CHECK-RV32VC-NEXT: mv a3, a0
+; CHECK-RV32VC-NEXT: mv a0, a1
; CHECK-RV32VC-NEXT: .LBB913_2:
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB913_4
+; CHECK-RV32VC-NEXT: bltu a2, a3, .LBB913_4
; CHECK-RV32VC-NEXT: # %bb.3:
-; CHECK-RV32VC-NEXT: mv a2, a1
+; CHECK-RV32VC-NEXT: mv a2, a3
; CHECK-RV32VC-NEXT: .LBB913_4:
-; CHECK-RV32VC-NEXT: sub a1, a2, a0
-; CHECK-RV32VC-NEXT: sltu a3, a2, a1
+; CHECK-RV32VC-NEXT: sub a0, a2, a1
+; CHECK-RV32VC-NEXT: sltu a3, a1, a2
; CHECK-RV32VC-NEXT: addi a3, a3, -1
-; CHECK-RV32VC-NEXT: and a1, a1, a3
-; CHECK-RV32VC-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: and a0, a0, a3
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT: bltu a2, a0, .LBB913_6
+; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB913_6
; CHECK-RV32VC-NEXT: # %bb.5:
-; CHECK-RV32VC-NEXT: mv a2, a0
+; CHECK-RV32VC-NEXT: mv a2, a1
; CHECK-RV32VC-NEXT: .LBB913_6:
; CHECK-RV32VC-NEXT: addi a0, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -36738,30 +36738,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64V-NEXT: mv a0, s1
; CHECK-RV64V-NEXT: call __muldi3
; CHECK-RV64V-NEXT: slli a2, s1, 2
-; CHECK-RV64V-NEXT: sub a1, s0, a2
-; CHECK-RV64V-NEXT: sltu a3, s0, a1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
-; CHECK-RV64V-NEXT: and a3, a3, a1
; CHECK-RV64V-NEXT: slli a1, s1, 1
+; CHECK-RV64V-NEXT: sub a3, s0, a2
+; CHECK-RV64V-NEXT: sltu a4, a2, s0
+; CHECK-RV64V-NEXT: addi a4, a4, -1
+; CHECK-RV64V-NEXT: and a3, a4, a3
; CHECK-RV64V-NEXT: sub a4, a3, a1
-; CHECK-RV64V-NEXT: sltu a5, a3, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a3
; CHECK-RV64V-NEXT: addi a5, a5, -1
-; CHECK-RV64V-NEXT: and a6, a5, a4
-; CHECK-RV64V-NEXT: sub a4, a6, s1
-; CHECK-RV64V-NEXT: mv a5, a6
-; CHECK-RV64V-NEXT: bltu a6, s1, .LBB914_2
+; CHECK-RV64V-NEXT: and a5, a5, a4
+; CHECK-RV64V-NEXT: mv a4, a5
+; CHECK-RV64V-NEXT: bltu a5, s1, .LBB914_2
; CHECK-RV64V-NEXT: # %bb.1:
-; CHECK-RV64V-NEXT: mv a5, s1
+; CHECK-RV64V-NEXT: mv a4, s1
; CHECK-RV64V-NEXT: .LBB914_2:
-; CHECK-RV64V-NEXT: sltu a7, a6, a4
+; CHECK-RV64V-NEXT: sltu a7, s1, a5
; CHECK-RV64V-NEXT: bltu a3, a1, .LBB914_4
; CHECK-RV64V-NEXT: # %bb.3:
; CHECK-RV64V-NEXT: mv a3, a1
; CHECK-RV64V-NEXT: .LBB914_4:
; CHECK-RV64V-NEXT: add a6, s2, a0
-; CHECK-RV64V-NEXT: addi a0, a7, -1
+; CHECK-RV64V-NEXT: sub a0, a5, s1
+; CHECK-RV64V-NEXT: addi a5, a7, -1
; CHECK-RV64V-NEXT: sub a7, a3, s1
-; CHECK-RV64V-NEXT: sltu t0, a3, a7
+; CHECK-RV64V-NEXT: sltu t0, s1, a3
; CHECK-RV64V-NEXT: addi t0, t0, -1
; CHECK-RV64V-NEXT: and a7, t0, a7
; CHECK-RV64V-NEXT: bltu a3, s1, .LBB914_6
@@ -36771,26 +36771,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64V-NEXT: vl8re64.v v16, (a6)
; CHECK-RV64V-NEXT: addi a6, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 3
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 3
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a7, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 4
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 4
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: and a0, a0, a4
+; CHECK-RV64V-NEXT: and a0, a5, a0
; CHECK-RV64V-NEXT: bltu s0, a2, .LBB914_8
; CHECK-RV64V-NEXT: # %bb.7:
; CHECK-RV64V-NEXT: mv s0, a2
@@ -36800,11 +36800,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64V-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64V-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64V-NEXT: sub a0, s0, a1
-; CHECK-RV64V-NEXT: sltu a2, s0, a0
+; CHECK-RV64V-NEXT: sltu a2, a1, s0
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a0, a2, a0
; CHECK-RV64V-NEXT: sub a2, a0, s1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, s1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: bltu a0, s1, .LBB914_10
@@ -36830,7 +36830,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64V-NEXT: mv s0, a1
; CHECK-RV64V-NEXT: .LBB914_12:
; CHECK-RV64V-NEXT: sub a0, s0, s1
-; CHECK-RV64V-NEXT: sltu a1, s0, a0
+; CHECK-RV64V-NEXT: sltu a1, s1, s0
; CHECK-RV64V-NEXT: addi a1, a1, -1
; CHECK-RV64V-NEXT: and a0, a1, a0
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -36885,45 +36885,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV32V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT: csrr a4, vlenb
+; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT: slli a3, a4, 3
-; CHECK-RV32V-NEXT: slli a1, a4, 2
-; CHECK-RV32V-NEXT: add a0, a0, a3
-; CHECK-RV32V-NEXT: sub a3, a2, a1
+; CHECK-RV32V-NEXT: slli a4, a1, 3
+; CHECK-RV32V-NEXT: slli a3, a1, 2
+; CHECK-RV32V-NEXT: slli a1, a1, 1
+; CHECK-RV32V-NEXT: add a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a2, a3
+; CHECK-RV32V-NEXT: sltu a5, a3, a2
; CHECK-RV32V-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT: sltu a0, a2, a3
-; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a3, a0, a3
-; CHECK-RV32V-NEXT: slli a0, a4, 1
-; CHECK-RV32V-NEXT: sub a4, a3, a0
-; CHECK-RV32V-NEXT: sltu a5, a3, a4
+; CHECK-RV32V-NEXT: addi a0, a5, -1
+; CHECK-RV32V-NEXT: and a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a0, a1
+; CHECK-RV32V-NEXT: sltu a5, a1, a0
; CHECK-RV32V-NEXT: addi a5, a5, -1
; CHECK-RV32V-NEXT: and a4, a5, a4
-; CHECK-RV32V-NEXT: bltu a3, a0, .LBB914_2
+; CHECK-RV32V-NEXT: bltu a0, a1, .LBB914_2
; CHECK-RV32V-NEXT: # %bb.1:
-; CHECK-RV32V-NEXT: mv a3, a0
+; CHECK-RV32V-NEXT: mv a0, a1
; CHECK-RV32V-NEXT: .LBB914_2:
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT: bltu a2, a1, .LBB914_4
+; CHECK-RV32V-NEXT: bltu a2, a3, .LBB914_4
; CHECK-RV32V-NEXT: # %bb.3:
-; CHECK-RV32V-NEXT: mv a2, a1
+; CHECK-RV32V-NEXT: mv a2, a3
; CHECK-RV32V-NEXT: .LBB914_4:
-; CHECK-RV32V-NEXT: sub a1, a2, a0
-; CHECK-RV32V-NEXT: sltu a3, a2, a1
+; CHECK-RV32V-NEXT: sub a0, a2, a1
+; CHECK-RV32V-NEXT: sltu a3, a1, a2
; CHECK-RV32V-NEXT: addi a3, a3, -1
-; CHECK-RV32V-NEXT: and a1, a3, a1
-; CHECK-RV32V-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: and a0, a3, a0
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT: bltu a2, a0, .LBB914_6
+; CHECK-RV32V-NEXT: bltu a2, a1, .LBB914_6
; CHECK-RV32V-NEXT: # %bb.5:
-; CHECK-RV32V-NEXT: mv a2, a0
+; CHECK-RV32V-NEXT: mv a2, a1
; CHECK-RV32V-NEXT: .LBB914_6:
; CHECK-RV32V-NEXT: addi a0, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -37011,33 +37011,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64VC-NEXT: li a1, 40
; CHECK-RV64VC-NEXT: mv a0, s1
; CHECK-RV64VC-NEXT: call __muldi3
-; CHECK-RV64VC-NEXT: slli a7, s1, 2
-; CHECK-RV64VC-NEXT: sub a1, s0, a7
-; CHECK-RV64VC-NEXT: sltu a2, s0, a1
-; CHECK-RV64VC-NEXT: addi a2, a2, -1
-; CHECK-RV64VC-NEXT: and a3, a2, a1
+; CHECK-RV64VC-NEXT: slli a2, s1, 2
; CHECK-RV64VC-NEXT: slli a1, s1, 1
-; CHECK-RV64VC-NEXT: sub a2, a3, a1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a3, s0, a2
+; CHECK-RV64VC-NEXT: sltu a4, a2, s0
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
-; CHECK-RV64VC-NEXT: sub t0, a2, s1
-; CHECK-RV64VC-NEXT: mv a5, a2
-; CHECK-RV64VC-NEXT: bltu a2, s1, .LBB914_2
+; CHECK-RV64VC-NEXT: and a3, a3, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a5, a1, a3
+; CHECK-RV64VC-NEXT: addi a5, a5, -1
+; CHECK-RV64VC-NEXT: and a5, a5, a4
+; CHECK-RV64VC-NEXT: mv t0, a5
+; CHECK-RV64VC-NEXT: bltu a5, s1, .LBB914_2
; CHECK-RV64VC-NEXT: # %bb.1:
-; CHECK-RV64VC-NEXT: mv a5, s1
+; CHECK-RV64VC-NEXT: mv t0, s1
; CHECK-RV64VC-NEXT: .LBB914_2:
-; CHECK-RV64VC-NEXT: sltu a6, a2, t0
+; CHECK-RV64VC-NEXT: sltu a7, s1, a5
; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB914_4
; CHECK-RV64VC-NEXT: # %bb.3:
; CHECK-RV64VC-NEXT: mv a3, a1
; CHECK-RV64VC-NEXT: .LBB914_4:
; CHECK-RV64VC-NEXT: add a0, a0, s2
-; CHECK-RV64VC-NEXT: addi a6, a6, -1
-; CHECK-RV64VC-NEXT: sub a2, a3, s1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a6, a5, s1
+; CHECK-RV64VC-NEXT: addi a7, a7, -1
+; CHECK-RV64VC-NEXT: sub a5, a3, s1
+; CHECK-RV64VC-NEXT: sltu a4, s1, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
+; CHECK-RV64VC-NEXT: and a5, a5, a4
; CHECK-RV64VC-NEXT: bltu a3, s1, .LBB914_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a3, s1
@@ -37045,7 +37045,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a0)
; CHECK-RV64VC-NEXT: addi a0, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, t0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v14, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -37053,7 +37053,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64VC-NEXT: add a0, a0, sp
; CHECK-RV64VC-NEXT: addi a0, a0, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v13, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -37064,21 +37064,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: and a0, a6, t0
-; CHECK-RV64VC-NEXT: bltu s0, a7, .LBB914_8
+; CHECK-RV64VC-NEXT: and a0, a7, a6
+; CHECK-RV64VC-NEXT: bltu s0, a2, .LBB914_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv s0, a7
+; CHECK-RV64VC-NEXT: mv s0, a2
; CHECK-RV64VC-NEXT: .LBB914_8:
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64VC-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64VC-NEXT: sub a0, s0, a1
-; CHECK-RV64VC-NEXT: sltu a2, s0, a0
+; CHECK-RV64VC-NEXT: sltu a2, a1, s0
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: sub a2, a0, s1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, s1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: bltu a0, s1, .LBB914_10
@@ -37104,7 +37104,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64VC-NEXT: mv s0, a1
; CHECK-RV64VC-NEXT: .LBB914_12:
; CHECK-RV64VC-NEXT: sub a0, s0, s1
-; CHECK-RV64VC-NEXT: sltu a1, s0, a0
+; CHECK-RV64VC-NEXT: sltu a1, s1, s0
; CHECK-RV64VC-NEXT: addi a1, a1, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -37159,45 +37159,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV32VC-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT: csrr a4, vlenb
+; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT: slli a3, a4, 3
-; CHECK-RV32VC-NEXT: slli a1, a4, 2
-; CHECK-RV32VC-NEXT: add a0, a0, a3
-; CHECK-RV32VC-NEXT: sub a3, a2, a1
+; CHECK-RV32VC-NEXT: slli a4, a1, 3
+; CHECK-RV32VC-NEXT: slli a3, a1, 2
+; CHECK-RV32VC-NEXT: slli a1, a1, 1
+; CHECK-RV32VC-NEXT: add a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a2, a3
+; CHECK-RV32VC-NEXT: sltu a5, a3, a2
; CHECK-RV32VC-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT: sltu a0, a2, a3
-; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a3, a3, a0
-; CHECK-RV32VC-NEXT: slli a0, a4, 1
-; CHECK-RV32VC-NEXT: sub a4, a3, a0
-; CHECK-RV32VC-NEXT: sltu a5, a3, a4
+; CHECK-RV32VC-NEXT: addi a0, a5, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a0, a1
+; CHECK-RV32VC-NEXT: sltu a5, a1, a0
; CHECK-RV32VC-NEXT: addi a5, a5, -1
; CHECK-RV32VC-NEXT: and a4, a4, a5
-; CHECK-RV32VC-NEXT: bltu a3, a0, .LBB914_2
+; CHECK-RV32VC-NEXT: bltu a0, a1, .LBB914_2
; CHECK-RV32VC-NEXT: # %bb.1:
-; CHECK-RV32VC-NEXT: mv a3, a0
+; CHECK-RV32VC-NEXT: mv a0, a1
; CHECK-RV32VC-NEXT: .LBB914_2:
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB914_4
+; CHECK-RV32VC-NEXT: bltu a2, a3, .LBB914_4
; CHECK-RV32VC-NEXT: # %bb.3:
-; CHECK-RV32VC-NEXT: mv a2, a1
+; CHECK-RV32VC-NEXT: mv a2, a3
; CHECK-RV32VC-NEXT: .LBB914_4:
-; CHECK-RV32VC-NEXT: sub a1, a2, a0
-; CHECK-RV32VC-NEXT: sltu a3, a2, a1
+; CHECK-RV32VC-NEXT: sub a0, a2, a1
+; CHECK-RV32VC-NEXT: sltu a3, a1, a2
; CHECK-RV32VC-NEXT: addi a3, a3, -1
-; CHECK-RV32VC-NEXT: and a1, a1, a3
-; CHECK-RV32VC-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: and a0, a0, a3
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT: bltu a2, a0, .LBB914_6
+; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB914_6
; CHECK-RV32VC-NEXT: # %bb.5:
-; CHECK-RV32VC-NEXT: mv a2, a0
+; CHECK-RV32VC-NEXT: mv a2, a1
; CHECK-RV32VC-NEXT: .LBB914_6:
; CHECK-RV32VC-NEXT: addi a0, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -37342,9 +37342,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, s0
+; CHECK-RV64V-NEXT: sltu a4, s0, a4
; CHECK-RV64V-NEXT: sub a5, a3, a1
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a5
+; CHECK-RV64V-NEXT: sltu a3, a1, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
@@ -37360,17 +37360,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT: sub a3, a0, s0
-; CHECK-RV64V-NEXT: sub a2, s1, a2
-; CHECK-RV64V-NEXT: sltu a0, a0, a3
-; CHECK-RV64V-NEXT: sltu a4, s1, a2
+; CHECK-RV64V-NEXT: sub a3, s1, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, s1
+; CHECK-RV64V-NEXT: sub a4, a0, s0
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: and a3, a0, a3
-; CHECK-RV64V-NEXT: and a0, a4, a2
+; CHECK-RV64V-NEXT: addi a2, a2, -1
+; CHECK-RV64V-NEXT: and a4, a0, a4
+; CHECK-RV64V-NEXT: and a0, a2, a3
; CHECK-RV64V-NEXT: addi a2, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64V-NEXT: mv a2, a0
@@ -37391,23 +37391,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT: sub a3, a2, s0
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a2, a2, a3
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a3, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
+; CHECK-RV64V-NEXT: sub a1, a2, s0
+; CHECK-RV64V-NEXT: sltu a2, s0, a2
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a2, a2, a3
-; CHECK-RV64V-NEXT: and a0, a0, a1
-; CHECK-RV64V-NEXT: csrr a1, vlenb
-; CHECK-RV64V-NEXT: slli a1, a1, 3
-; CHECK-RV64V-NEXT: mv a3, a1
-; CHECK-RV64V-NEXT: slli a1, a1, 1
-; CHECK-RV64V-NEXT: add a1, a1, a3
-; CHECK-RV64V-NEXT: add a1, sp, a1
-; CHECK-RV64V-NEXT: addi a1, a1, 16
-; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: and a1, a2, a1
+; CHECK-RV64V-NEXT: and a0, a0, a3
+; CHECK-RV64V-NEXT: csrr a2, vlenb
+; CHECK-RV64V-NEXT: slli a2, a2, 3
+; CHECK-RV64V-NEXT: mv a3, a2
+; CHECK-RV64V-NEXT: slli a2, a2, 1
+; CHECK-RV64V-NEXT: add a2, a2, a3
+; CHECK-RV64V-NEXT: add a2, sp, a2
+; CHECK-RV64V-NEXT: addi a2, a2, 16
+; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: vsetvli zero, a1, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v29, (zero), v8
; CHECK-RV64V-NEXT: mv a1, a0
@@ -37424,7 +37424,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64V-NEXT: sub a1, a0, s0
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
; CHECK-RV64V-NEXT: and a0, a0, a1
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -37493,13 +37493,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32V-NEXT: sub a0, a4, a1
-; CHECK-RV32V-NEXT: sub a2, a3, a2
-; CHECK-RV32V-NEXT: sltu a4, a4, a0
-; CHECK-RV32V-NEXT: sltu a3, a3, a2
+; CHECK-RV32V-NEXT: sltu a4, a1, a4
+; CHECK-RV32V-NEXT: sub a5, a3, a2
+; CHECK-RV32V-NEXT: sltu a2, a2, a3
; CHECK-RV32V-NEXT: addi a4, a4, -1
-; CHECK-RV32V-NEXT: addi a3, a3, -1
+; CHECK-RV32V-NEXT: addi a2, a2, -1
; CHECK-RV32V-NEXT: and a4, a4, a0
-; CHECK-RV32V-NEXT: and a0, a3, a2
+; CHECK-RV32V-NEXT: and a0, a2, a5
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v0
@@ -37511,10 +37511,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT: sub a1, a0, a1
-; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: sub a2, a0, a1
+; CHECK-RV32V-NEXT: sltu a0, a1, a0
; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a0, a0, a1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -37609,102 +37609,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64VC-NEXT: mv a3, a6
; CHECK-RV64VC-NEXT: .LBB915_2:
; CHECK-RV64VC-NEXT: slli a5, s0, 4
-; CHECK-RV64VC-NEXT: slli a7, s0, 1
-; CHECK-RV64VC-NEXT: slli a2, s0, 3
+; CHECK-RV64VC-NEXT: slli a1, s0, 1
+; CHECK-RV64VC-NEXT: slli a7, s0, 3
; CHECK-RV64VC-NEXT: mv a4, a3
-; CHECK-RV64VC-NEXT: bltu a3, a7, .LBB915_4
+; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB915_4
; CHECK-RV64VC-NEXT: # %bb.3:
-; CHECK-RV64VC-NEXT: mv a4, a7
+; CHECK-RV64VC-NEXT: mv a4, a1
; CHECK-RV64VC-NEXT: .LBB915_4:
; CHECK-RV64VC-NEXT: vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT: add a1, s1, a0
+; CHECK-RV64VC-NEXT: add a2, s1, a0
; CHECK-RV64VC-NEXT: add a5, a5, s1
-; CHECK-RV64VC-NEXT: add a2, a2, s1
+; CHECK-RV64VC-NEXT: add a7, a7, s1
; CHECK-RV64VC-NEXT: mv a0, a4
; CHECK-RV64VC-NEXT: bltu a4, s0, .LBB915_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a0, s0
; CHECK-RV64VC-NEXT: .LBB915_6:
-; CHECK-RV64VC-NEXT: vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 3
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 3
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT: addi a1, sp, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT: vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: mv a2, a1
-; CHECK-RV64VC-NEXT: slli a1, a1, 1
-; CHECK-RV64VC-NEXT: add a1, a1, a2
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: addi a2, sp, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: mv a5, a2
+; CHECK-RV64VC-NEXT: slli a2, a2, 1
+; CHECK-RV64VC-NEXT: add a2, a2, a5
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, s0
-; CHECK-RV64VC-NEXT: sub a1, a3, a7
-; CHECK-RV64VC-NEXT: sltu a2, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a1
+; CHECK-RV64VC-NEXT: sltu a2, s0, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a3, a1, a3
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a1
+; CHECK-RV64VC-NEXT: and a0, a3, a4
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT: mv a1, a0
+; CHECK-RV64VC-NEXT: mv a2, a0
; CHECK-RV64VC-NEXT: bltu a0, s0, .LBB915_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a2, s0
; CHECK-RV64VC-NEXT: .LBB915_8:
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT: sub a1, a0, s0
; CHECK-RV64VC-NEXT: sub a2, s2, a6
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, s2, a2
+; CHECK-RV64VC-NEXT: sltu a3, a6, s2
+; CHECK-RV64VC-NEXT: sub a4, a0, s0
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
-; CHECK-RV64VC-NEXT: and a1, a1, a0
+; CHECK-RV64VC-NEXT: and a4, a4, a0
; CHECK-RV64VC-NEXT: and a0, a3, a2
; CHECK-RV64VC-NEXT: addi a2, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64VC-NEXT: mv a2, a0
-; CHECK-RV64VC-NEXT: bltu a0, a7, .LBB915_10
+; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB915_10
; CHECK-RV64VC-NEXT: # %bb.9:
-; CHECK-RV64VC-NEXT: mv a2, a7
+; CHECK-RV64VC-NEXT: mv a2, a1
; CHECK-RV64VC-NEXT: .LBB915_10:
-; CHECK-RV64VC-NEXT: mv a1, a2
+; CHECK-RV64VC-NEXT: mv a3, a2
; CHECK-RV64VC-NEXT: bltu a2, s0, .LBB915_12
; CHECK-RV64VC-NEXT: # %bb.11:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a3, s0
; CHECK-RV64VC-NEXT: .LBB915_12:
-; CHECK-RV64VC-NEXT: csrr a3, vlenb
-; CHECK-RV64VC-NEXT: slli a3, a3, 3
-; CHECK-RV64VC-NEXT: add a3, a3, sp
-; CHECK-RV64VC-NEXT: addi a3, a3, 16
-; CHECK-RV64VC-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: csrr a4, vlenb
+; CHECK-RV64VC-NEXT: slli a4, a4, 3
+; CHECK-RV64VC-NEXT: add a4, a4, sp
+; CHECK-RV64VC-NEXT: addi a4, a4, 16
+; CHECK-RV64VC-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT: sub a3, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: sub a1, a2, s0
-; CHECK-RV64VC-NEXT: sub a3, a0, a7
-; CHECK-RV64VC-NEXT: sltu a2, a2, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a3
+; CHECK-RV64VC-NEXT: sltu a2, s0, a2
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a1, a1, a2
@@ -37734,7 +37734,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64VC-NEXT: sub a1, a0, s0
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -37803,13 +37803,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32VC-NEXT: sub a0, a4, a1
-; CHECK-RV32VC-NEXT: sub a2, a3, a2
-; CHECK-RV32VC-NEXT: sltu a4, a4, a0
-; CHECK-RV32VC-NEXT: sltu a3, a3, a2
+; CHECK-RV32VC-NEXT: sltu a4, a1, a4
+; CHECK-RV32VC-NEXT: sub a5, a3, a2
+; CHECK-RV32VC-NEXT: sltu a2, a2, a3
; CHECK-RV32VC-NEXT: addi a4, a4, -1
-; CHECK-RV32VC-NEXT: addi a3, a3, -1
+; CHECK-RV32VC-NEXT: addi a2, a2, -1
; CHECK-RV32VC-NEXT: and a4, a4, a0
-; CHECK-RV32VC-NEXT: and a0, a3, a2
+; CHECK-RV32VC-NEXT: and a0, a2, a5
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v0
@@ -37821,10 +37821,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT: sub a1, a0, a1
-; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a0, a1
+; CHECK-RV32VC-NEXT: sltu a0, a1, a0
; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -37967,9 +37967,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, s0
+; CHECK-RV64V-NEXT: sltu a4, s0, a4
; CHECK-RV64V-NEXT: sub a5, a3, a1
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a5
+; CHECK-RV64V-NEXT: sltu a3, a1, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
@@ -37985,17 +37985,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT: sub a3, a0, s0
-; CHECK-RV64V-NEXT: sub a2, s1, a2
-; CHECK-RV64V-NEXT: sltu a0, a0, a3
-; CHECK-RV64V-NEXT: sltu a4, s1, a2
+; CHECK-RV64V-NEXT: sub a3, s1, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, s1
+; CHECK-RV64V-NEXT: sub a4, a0, s0
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: and a3, a0, a3
-; CHECK-RV64V-NEXT: and a0, a4, a2
+; CHECK-RV64V-NEXT: addi a2, a2, -1
+; CHECK-RV64V-NEXT: and a4, a0, a4
+; CHECK-RV64V-NEXT: and a0, a2, a3
; CHECK-RV64V-NEXT: addi a2, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64V-NEXT: mv a2, a0
@@ -38016,23 +38016,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT: sub a3, a2, s0
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a2, a2, a3
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a3, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
+; CHECK-RV64V-NEXT: sub a1, a2, s0
+; CHECK-RV64V-NEXT: sltu a2, s0, a2
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a2, a2, a3
-; CHECK-RV64V-NEXT: and a0, a0, a1
-; CHECK-RV64V-NEXT: csrr a1, vlenb
-; CHECK-RV64V-NEXT: slli a1, a1, 3
-; CHECK-RV64V-NEXT: mv a3, a1
-; CHECK-RV64V-NEXT: slli a1, a1, 1
-; CHECK-RV64V-NEXT: add a1, a1, a3
-; CHECK-RV64V-NEXT: add a1, sp, a1
-; CHECK-RV64V-NEXT: addi a1, a1, 16
-; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: and a1, a2, a1
+; CHECK-RV64V-NEXT: and a0, a0, a3
+; CHECK-RV64V-NEXT: csrr a2, vlenb
+; CHECK-RV64V-NEXT: slli a2, a2, 3
+; CHECK-RV64V-NEXT: mv a3, a2
+; CHECK-RV64V-NEXT: slli a2, a2, 1
+; CHECK-RV64V-NEXT: add a2, a2, a3
+; CHECK-RV64V-NEXT: add a2, sp, a2
+; CHECK-RV64V-NEXT: addi a2, a2, 16
+; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: vsetvli zero, a1, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v29, (zero), v8
; CHECK-RV64V-NEXT: mv a1, a0
@@ -38049,7 +38049,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64V-NEXT: sub a1, a0, s0
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
; CHECK-RV64V-NEXT: and a0, a0, a1
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -38118,13 +38118,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32V-NEXT: sub a0, a4, a1
-; CHECK-RV32V-NEXT: sub a2, a3, a2
-; CHECK-RV32V-NEXT: sltu a4, a4, a0
-; CHECK-RV32V-NEXT: sltu a3, a3, a2
+; CHECK-RV32V-NEXT: sltu a4, a1, a4
+; CHECK-RV32V-NEXT: sub a5, a3, a2
+; CHECK-RV32V-NEXT: sltu a2, a2, a3
; CHECK-RV32V-NEXT: addi a4, a4, -1
-; CHECK-RV32V-NEXT: addi a3, a3, -1
+; CHECK-RV32V-NEXT: addi a2, a2, -1
; CHECK-RV32V-NEXT: and a4, a4, a0
-; CHECK-RV32V-NEXT: and a0, a3, a2
+; CHECK-RV32V-NEXT: and a0, a2, a5
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v0
@@ -38136,10 +38136,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT: sub a1, a0, a1
-; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: sub a2, a0, a1
+; CHECK-RV32V-NEXT: sltu a0, a1, a0
; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a0, a0, a1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -38234,102 +38234,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV64VC-NEXT: mv a3, a6
; CHECK-RV64VC-NEXT: .LBB916_2:
; CHECK-RV64VC-NEXT: slli a5, s0, 4
-; CHECK-RV64VC-NEXT: slli a7, s0, 1
-; CHECK-RV64VC-NEXT: slli a2, s0, 3
+; CHECK-RV64VC-NEXT: slli a1, s0, 1
+; CHECK-RV64VC-NEXT: slli a7, s0, 3
; CHECK-RV64VC-NEXT: mv a4, a3
-; CHECK-RV64VC-NEXT: bltu a3, a7, .LBB916_4
+; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB916_4
; CHECK-RV64VC-NEXT: # %bb.3:
-; CHECK-RV64VC-NEXT: mv a4, a7
+; CHECK-RV64VC-NEXT: mv a4, a1
; CHECK-RV64VC-NEXT: .LBB916_4:
; CHECK-RV64VC-NEXT: vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT: add a1, s1, a0
+; CHECK-RV64VC-NEXT: add a2, s1, a0
; CHECK-RV64VC-NEXT: add a5, a5, s1
-; CHECK-RV64VC-NEXT: add a2, a2, s1
+; CHECK-RV64VC-NEXT: add a7, a7, s1
; CHECK-RV64VC-NEXT: mv a0, a4
; CHECK-RV64VC-NEXT: bltu a4, s0, .LBB916_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a0, s0
; CHECK-RV64VC-NEXT: .LBB916_6:
-; CHECK-RV64VC-NEXT: vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 3
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 3
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT: addi a1, sp, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT: vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: mv a2, a1
-; CHECK-RV64VC-NEXT: slli a1, a1, 1
-; CHECK-RV64VC-NEXT: add a1, a1, a2
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: addi a2, sp, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: mv a5, a2
+; CHECK-RV64VC-NEXT: slli a2, a2, 1
+; CHECK-RV64VC-NEXT: add a2, a2, a5
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, s0
-; CHECK-RV64VC-NEXT: sub a1, a3, a7
-; CHECK-RV64VC-NEXT: sltu a2, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a1
+; CHECK-RV64VC-NEXT: sltu a2, s0, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a3, a1, a3
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a1
+; CHECK-RV64VC-NEXT: and a0, a3, a4
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT: mv a1, a0
+; CHECK-RV64VC-NEXT: mv a2, a0
; CHECK-RV64VC-NEXT: bltu a0, s0, .LBB916_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a2, s0
; CHECK-RV64VC-NEXT: .LBB916_8:
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT: sub a1, a0, s0
; CHECK-RV64VC-NEXT: sub a2, s2, a6
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, s2, a2
+; CHECK-RV64VC-NEXT: sltu a3, a6, s2
+; CHECK-RV64VC-NEXT: sub a4, a0, s0
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
-; CHECK-RV64VC-NEXT: and a1, a1, a0
+; CHECK-RV64VC-NEXT: and a4, a4, a0
; CHECK-RV64VC-NEXT: and a0, a3, a2
; CHECK-RV64VC-NEXT: addi a2, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64VC-NEXT: mv a2, a0
-; CHECK-RV64VC-NEXT: bltu a0, a7, .LBB916_10
+; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB916_10
; CHECK-RV64VC-NEXT: # %bb.9:
-; CHECK-RV64VC-NEXT: mv a2, a7
+; CHECK-RV64VC-NEXT: mv a2, a1
; CHECK-RV64VC-NEXT: .LBB916_10:
-; CHECK-RV64VC-NEXT: mv a1, a2
+; CHECK-RV64VC-NEXT: mv a3, a2
; CHECK-RV64VC-NEXT: bltu a2, s0, .LBB916_12
; CHECK-RV64VC-NEXT: # %bb.11:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a3, s0
; CHECK-RV64VC-NEXT: .LBB916_12:
-; CHECK-RV64VC-NEXT: csrr a3, vlenb
-; CHECK-RV64VC-NEXT: slli a3, a3, 3
-; CHECK-RV64VC-NEXT: add a3, a3, sp
-; CHECK-RV64VC-NEXT: addi a3, a3, 16
-; CHECK-RV64VC-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: csrr a4, vlenb
+; CHECK-RV64VC-NEXT: slli a4, a4, 3
+; CHECK-RV64VC-NEXT: add a4, a4, sp
+; CHECK-RV64VC-NEXT: addi a4, a4, 16
+; CHECK-RV64VC-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT: sub a3, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: sub a1, a2, s0
-; CHECK-RV64VC-NEXT: sub a3, a0, a7
-; CHECK-RV64VC-NEXT: sltu a2, a2, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a3
+; CHECK-RV64VC-NEXT: sltu a2, s0, a2
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a1, a1, a2
@@ -38359,7 +38359,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64VC-NEXT: sub a1, a0, s0
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -38428,13 +38428,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32VC-NEXT: sub a0, a4, a1
-; CHECK-RV32VC-NEXT: sub a2, a3, a2
-; CHECK-RV32VC-NEXT: sltu a4, a4, a0
-; CHECK-RV32VC-NEXT: sltu a3, a3, a2
+; CHECK-RV32VC-NEXT: sltu a4, a1, a4
+; CHECK-RV32VC-NEXT: sub a5, a3, a2
+; CHECK-RV32VC-NEXT: sltu a2, a2, a3
; CHECK-RV32VC-NEXT: addi a4, a4, -1
-; CHECK-RV32VC-NEXT: addi a3, a3, -1
+; CHECK-RV32VC-NEXT: addi a2, a2, -1
; CHECK-RV32VC-NEXT: and a4, a4, a0
-; CHECK-RV32VC-NEXT: and a0, a3, a2
+; CHECK-RV32VC-NEXT: and a0, a2, a5
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v0
@@ -38446,10 +38446,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT: sub a1, a0, a1
-; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a0, a1
+; CHECK-RV32VC-NEXT: sltu a0, a1, a0
; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -38592,9 +38592,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, s0
+; CHECK-RV64V-NEXT: sltu a4, s0, a4
; CHECK-RV64V-NEXT: sub a5, a3, a1
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a5
+; CHECK-RV64V-NEXT: sltu a3, a1, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
@@ -38610,17 +38610,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT: sub a3, a0, s0
-; CHECK-RV64V-NEXT: sub a2, s1, a2
-; CHECK-RV64V-NEXT: sltu a0, a0, a3
-; CHECK-RV64V-NEXT: sltu a4, s1, a2
+; CHECK-RV64V-NEXT: sub a3, s1, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, s1
+; CHECK-RV64V-NEXT: sub a4, a0, s0
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: and a3, a0, a3
-; CHECK-RV64V-NEXT: and a0, a4, a2
+; CHECK-RV64V-NEXT: addi a2, a2, -1
+; CHECK-RV64V-NEXT: and a4, a0, a4
+; CHECK-RV64V-NEXT: and a0, a2, a3
; CHECK-RV64V-NEXT: addi a2, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64V-NEXT: mv a2, a0
@@ -38641,23 +38641,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT: sub a3, a2, s0
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a2, a2, a3
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a3, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
+; CHECK-RV64V-NEXT: sub a1, a2, s0
+; CHECK-RV64V-NEXT: sltu a2, s0, a2
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a2, a2, a3
-; CHECK-RV64V-NEXT: and a0, a0, a1
-; CHECK-RV64V-NEXT: csrr a1, vlenb
-; CHECK-RV64V-NEXT: slli a1, a1, 3
-; CHECK-RV64V-NEXT: mv a3, a1
-; CHECK-RV64V-NEXT: slli a1, a1, 1
-; CHECK-RV64V-NEXT: add a1, a1, a3
-; CHECK-RV64V-NEXT: add a1, sp, a1
-; CHECK-RV64V-NEXT: addi a1, a1, 16
-; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: and a1, a2, a1
+; CHECK-RV64V-NEXT: and a0, a0, a3
+; CHECK-RV64V-NEXT: csrr a2, vlenb
+; CHECK-RV64V-NEXT: slli a2, a2, 3
+; CHECK-RV64V-NEXT: mv a3, a2
+; CHECK-RV64V-NEXT: slli a2, a2, 1
+; CHECK-RV64V-NEXT: add a2, a2, a3
+; CHECK-RV64V-NEXT: add a2, sp, a2
+; CHECK-RV64V-NEXT: addi a2, a2, 16
+; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: vsetvli zero, a1, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v29, (zero), v8
; CHECK-RV64V-NEXT: mv a1, a0
@@ -38674,7 +38674,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64V-NEXT: sub a1, a0, s0
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
; CHECK-RV64V-NEXT: and a0, a0, a1
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -38743,13 +38743,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32V-NEXT: sub a0, a4, a1
-; CHECK-RV32V-NEXT: sub a2, a3, a2
-; CHECK-RV32V-NEXT: sltu a4, a4, a0
-; CHECK-RV32V-NEXT: sltu a3, a3, a2
+; CHECK-RV32V-NEXT: sltu a4, a1, a4
+; CHECK-RV32V-NEXT: sub a5, a3, a2
+; CHECK-RV32V-NEXT: sltu a2, a2, a3
; CHECK-RV32V-NEXT: addi a4, a4, -1
-; CHECK-RV32V-NEXT: addi a3, a3, -1
+; CHECK-RV32V-NEXT: addi a2, a2, -1
; CHECK-RV32V-NEXT: and a4, a4, a0
-; CHECK-RV32V-NEXT: and a0, a3, a2
+; CHECK-RV32V-NEXT: and a0, a2, a5
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v0
@@ -38761,10 +38761,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT: sub a1, a0, a1
-; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: sub a2, a0, a1
+; CHECK-RV32V-NEXT: sltu a0, a1, a0
; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a0, a0, a1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -38859,102 +38859,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64VC-NEXT: mv a3, a6
; CHECK-RV64VC-NEXT: .LBB917_2:
; CHECK-RV64VC-NEXT: slli a5, s0, 4
-; CHECK-RV64VC-NEXT: slli a7, s0, 1
-; CHECK-RV64VC-NEXT: slli a2, s0, 3
+; CHECK-RV64VC-NEXT: slli a1, s0, 1
+; CHECK-RV64VC-NEXT: slli a7, s0, 3
; CHECK-RV64VC-NEXT: mv a4, a3
-; CHECK-RV64VC-NEXT: bltu a3, a7, .LBB917_4
+; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB917_4
; CHECK-RV64VC-NEXT: # %bb.3:
-; CHECK-RV64VC-NEXT: mv a4, a7
+; CHECK-RV64VC-NEXT: mv a4, a1
; CHECK-RV64VC-NEXT: .LBB917_4:
; CHECK-RV64VC-NEXT: vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT: add a1, s1, a0
+; CHECK-RV64VC-NEXT: add a2, s1, a0
; CHECK-RV64VC-NEXT: add a5, a5, s1
-; CHECK-RV64VC-NEXT: add a2, a2, s1
+; CHECK-RV64VC-NEXT: add a7, a7, s1
; CHECK-RV64VC-NEXT: mv a0, a4
; CHECK-RV64VC-NEXT: bltu a4, s0, .LBB917_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a0, s0
; CHECK-RV64VC-NEXT: .LBB917_6:
-; CHECK-RV64VC-NEXT: vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 3
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 3
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT: addi a1, sp, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT: vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: mv a2, a1
-; CHECK-RV64VC-NEXT: slli a1, a1, 1
-; CHECK-RV64VC-NEXT: add a1, a1, a2
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: addi a2, sp, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: mv a5, a2
+; CHECK-RV64VC-NEXT: slli a2, a2, 1
+; CHECK-RV64VC-NEXT: add a2, a2, a5
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, s0
-; CHECK-RV64VC-NEXT: sub a1, a3, a7
-; CHECK-RV64VC-NEXT: sltu a2, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a1
+; CHECK-RV64VC-NEXT: sltu a2, s0, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a3, a1, a3
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a1
+; CHECK-RV64VC-NEXT: and a0, a3, a4
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT: mv a1, a0
+; CHECK-RV64VC-NEXT: mv a2, a0
; CHECK-RV64VC-NEXT: bltu a0, s0, .LBB917_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a2, s0
; CHECK-RV64VC-NEXT: .LBB917_8:
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT: sub a1, a0, s0
; CHECK-RV64VC-NEXT: sub a2, s2, a6
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, s2, a2
+; CHECK-RV64VC-NEXT: sltu a3, a6, s2
+; CHECK-RV64VC-NEXT: sub a4, a0, s0
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
-; CHECK-RV64VC-NEXT: and a1, a1, a0
+; CHECK-RV64VC-NEXT: and a4, a4, a0
; CHECK-RV64VC-NEXT: and a0, a3, a2
; CHECK-RV64VC-NEXT: addi a2, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64VC-NEXT: mv a2, a0
-; CHECK-RV64VC-NEXT: bltu a0, a7, .LBB917_10
+; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB917_10
; CHECK-RV64VC-NEXT: # %bb.9:
-; CHECK-RV64VC-NEXT: mv a2, a7
+; CHECK-RV64VC-NEXT: mv a2, a1
; CHECK-RV64VC-NEXT: .LBB917_10:
-; CHECK-RV64VC-NEXT: mv a1, a2
+; CHECK-RV64VC-NEXT: mv a3, a2
; CHECK-RV64VC-NEXT: bltu a2, s0, .LBB917_12
; CHECK-RV64VC-NEXT: # %bb.11:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a3, s0
; CHECK-RV64VC-NEXT: .LBB917_12:
-; CHECK-RV64VC-NEXT: csrr a3, vlenb
-; CHECK-RV64VC-NEXT: slli a3, a3, 3
-; CHECK-RV64VC-NEXT: add a3, a3, sp
-; CHECK-RV64VC-NEXT: addi a3, a3, 16
-; CHECK-RV64VC-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: csrr a4, vlenb
+; CHECK-RV64VC-NEXT: slli a4, a4, 3
+; CHECK-RV64VC-NEXT: add a4, a4, sp
+; CHECK-RV64VC-NEXT: addi a4, a4, 16
+; CHECK-RV64VC-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT: sub a3, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: sub a1, a2, s0
-; CHECK-RV64VC-NEXT: sub a3, a0, a7
-; CHECK-RV64VC-NEXT: sltu a2, a2, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a3
+; CHECK-RV64VC-NEXT: sltu a2, s0, a2
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a1, a1, a2
@@ -38984,7 +38984,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64VC-NEXT: sub a1, a0, s0
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -39053,13 +39053,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32VC-NEXT: sub a0, a4, a1
-; CHECK-RV32VC-NEXT: sub a2, a3, a2
-; CHECK-RV32VC-NEXT: sltu a4, a4, a0
-; CHECK-RV32VC-NEXT: sltu a3, a3, a2
+; CHECK-RV32VC-NEXT: sltu a4, a1, a4
+; CHECK-RV32VC-NEXT: sub a5, a3, a2
+; CHECK-RV32VC-NEXT: sltu a2, a2, a3
; CHECK-RV32VC-NEXT: addi a4, a4, -1
-; CHECK-RV32VC-NEXT: addi a3, a3, -1
+; CHECK-RV32VC-NEXT: addi a2, a2, -1
; CHECK-RV32VC-NEXT: and a4, a4, a0
-; CHECK-RV32VC-NEXT: and a0, a3, a2
+; CHECK-RV32VC-NEXT: and a0, a2, a5
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v0
@@ -39071,10 +39071,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT: sub a1, a0, a1
-; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a0, a1
+; CHECK-RV32VC-NEXT: sltu a0, a1, a0
; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -39217,9 +39217,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, s0
+; CHECK-RV64V-NEXT: sltu a4, s0, a4
; CHECK-RV64V-NEXT: sub a5, a3, a1
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a5
+; CHECK-RV64V-NEXT: sltu a3, a1, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
@@ -39235,17 +39235,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT: sub a3, a0, s0
-; CHECK-RV64V-NEXT: sub a2, s1, a2
-; CHECK-RV64V-NEXT: sltu a0, a0, a3
-; CHECK-RV64V-NEXT: sltu a4, s1, a2
+; CHECK-RV64V-NEXT: sub a3, s1, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, s1
+; CHECK-RV64V-NEXT: sub a4, a0, s0
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: and a3, a0, a3
-; CHECK-RV64V-NEXT: and a0, a4, a2
+; CHECK-RV64V-NEXT: addi a2, a2, -1
+; CHECK-RV64V-NEXT: and a4, a0, a4
+; CHECK-RV64V-NEXT: and a0, a2, a3
; CHECK-RV64V-NEXT: addi a2, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64V-NEXT: mv a2, a0
@@ -39266,23 +39266,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT: sub a3, a2, s0
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a2, a2, a3
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a3, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
+; CHECK-RV64V-NEXT: sub a1, a2, s0
+; CHECK-RV64V-NEXT: sltu a2, s0, a2
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a2, a2, a3
-; CHECK-RV64V-NEXT: and a0, a0, a1
-; CHECK-RV64V-NEXT: csrr a1, vlenb
-; CHECK-RV64V-NEXT: slli a1, a1, 3
-; CHECK-RV64V-NEXT: mv a3, a1
-; CHECK-RV64V-NEXT: slli a1, a1, 1
-; CHECK-RV64V-NEXT: add a1, a1, a3
-; CHECK-RV64V-NEXT: add a1, sp, a1
-; CHECK-RV64V-NEXT: addi a1, a1, 16
-; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: and a1, a2, a1
+; CHECK-RV64V-NEXT: and a0, a0, a3
+; CHECK-RV64V-NEXT: csrr a2, vlenb
+; CHECK-RV64V-NEXT: slli a2, a2, 3
+; CHECK-RV64V-NEXT: mv a3, a2
+; CHECK-RV64V-NEXT: slli a2, a2, 1
+; CHECK-RV64V-NEXT: add a2, a2, a3
+; CHECK-RV64V-NEXT: add a2, sp, a2
+; CHECK-RV64V-NEXT: addi a2, a2, 16
+; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: vsetvli zero, a1, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v29, (zero), v8
; CHECK-RV64V-NEXT: mv a1, a0
@@ -39299,7 +39299,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64V-NEXT: sub a1, a0, s0
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
; CHECK-RV64V-NEXT: and a0, a0, a1
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -39368,13 +39368,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32V-NEXT: sub a0, a4, a1
-; CHECK-RV32V-NEXT: sub a2, a3, a2
-; CHECK-RV32V-NEXT: sltu a4, a4, a0
-; CHECK-RV32V-NEXT: sltu a3, a3, a2
+; CHECK-RV32V-NEXT: sltu a4, a1, a4
+; CHECK-RV32V-NEXT: sub a5, a3, a2
+; CHECK-RV32V-NEXT: sltu a2, a2, a3
; CHECK-RV32V-NEXT: addi a4, a4, -1
-; CHECK-RV32V-NEXT: addi a3, a3, -1
+; CHECK-RV32V-NEXT: addi a2, a2, -1
; CHECK-RV32V-NEXT: and a4, a4, a0
-; CHECK-RV32V-NEXT: and a0, a3, a2
+; CHECK-RV32V-NEXT: and a0, a2, a5
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v0
@@ -39386,10 +39386,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT: sub a1, a0, a1
-; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: sub a2, a0, a1
+; CHECK-RV32V-NEXT: sltu a0, a1, a0
; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a0, a0, a1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -39484,102 +39484,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV64VC-NEXT: mv a3, a6
; CHECK-RV64VC-NEXT: .LBB918_2:
; CHECK-RV64VC-NEXT: slli a5, s0, 4
-; CHECK-RV64VC-NEXT: slli a7, s0, 1
-; CHECK-RV64VC-NEXT: slli a2, s0, 3
+; CHECK-RV64VC-NEXT: slli a1, s0, 1
+; CHECK-RV64VC-NEXT: slli a7, s0, 3
; CHECK-RV64VC-NEXT: mv a4, a3
-; CHECK-RV64VC-NEXT: bltu a3, a7, .LBB918_4
+; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB918_4
; CHECK-RV64VC-NEXT: # %bb.3:
-; CHECK-RV64VC-NEXT: mv a4, a7
+; CHECK-RV64VC-NEXT: mv a4, a1
; CHECK-RV64VC-NEXT: .LBB918_4:
; CHECK-RV64VC-NEXT: vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT: add a1, s1, a0
+; CHECK-RV64VC-NEXT: add a2, s1, a0
; CHECK-RV64VC-NEXT: add a5, a5, s1
-; CHECK-RV64VC-NEXT: add a2, a2, s1
+; CHECK-RV64VC-NEXT: add a7, a7, s1
; CHECK-RV64VC-NEXT: mv a0, a4
; CHECK-RV64VC-NEXT: bltu a4, s0, .LBB918_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a0, s0
; CHECK-RV64VC-NEXT: .LBB918_6:
-; CHECK-RV64VC-NEXT: vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 3
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 3
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT: addi a1, sp, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT: vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: mv a2, a1
-; CHECK-RV64VC-NEXT: slli a1, a1, 1
-; CHECK-RV64VC-NEXT: add a1, a1, a2
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: addi a2, sp, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: mv a5, a2
+; CHECK-RV64VC-NEXT: slli a2, a2, 1
+; CHECK-RV64VC-NEXT: add a2, a2, a5
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, s0
-; CHECK-RV64VC-NEXT: sub a1, a3, a7
-; CHECK-RV64VC-NEXT: sltu a2, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a1
+; CHECK-RV64VC-NEXT: sltu a2, s0, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a3, a1, a3
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a1
+; CHECK-RV64VC-NEXT: and a0, a3, a4
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT: mv a1, a0
+; CHECK-RV64VC-NEXT: mv a2, a0
; CHECK-RV64VC-NEXT: bltu a0, s0, .LBB918_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a2, s0
; CHECK-RV64VC-NEXT: .LBB918_8:
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT: sub a1, a0, s0
; CHECK-RV64VC-NEXT: sub a2, s2, a6
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, s2, a2
+; CHECK-RV64VC-NEXT: sltu a3, a6, s2
+; CHECK-RV64VC-NEXT: sub a4, a0, s0
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
-; CHECK-RV64VC-NEXT: and a1, a1, a0
+; CHECK-RV64VC-NEXT: and a4, a4, a0
; CHECK-RV64VC-NEXT: and a0, a3, a2
; CHECK-RV64VC-NEXT: addi a2, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64VC-NEXT: mv a2, a0
-; CHECK-RV64VC-NEXT: bltu a0, a7, .LBB918_10
+; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB918_10
; CHECK-RV64VC-NEXT: # %bb.9:
-; CHECK-RV64VC-NEXT: mv a2, a7
+; CHECK-RV64VC-NEXT: mv a2, a1
; CHECK-RV64VC-NEXT: .LBB918_10:
-; CHECK-RV64VC-NEXT: mv a1, a2
+; CHECK-RV64VC-NEXT: mv a3, a2
; CHECK-RV64VC-NEXT: bltu a2, s0, .LBB918_12
; CHECK-RV64VC-NEXT: # %bb.11:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a3, s0
; CHECK-RV64VC-NEXT: .LBB918_12:
-; CHECK-RV64VC-NEXT: csrr a3, vlenb
-; CHECK-RV64VC-NEXT: slli a3, a3, 3
-; CHECK-RV64VC-NEXT: add a3, a3, sp
-; CHECK-RV64VC-NEXT: addi a3, a3, 16
-; CHECK-RV64VC-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: csrr a4, vlenb
+; CHECK-RV64VC-NEXT: slli a4, a4, 3
+; CHECK-RV64VC-NEXT: add a4, a4, sp
+; CHECK-RV64VC-NEXT: addi a4, a4, 16
+; CHECK-RV64VC-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT: sub a3, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: sub a1, a2, s0
-; CHECK-RV64VC-NEXT: sub a3, a0, a7
-; CHECK-RV64VC-NEXT: sltu a2, a2, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a3
+; CHECK-RV64VC-NEXT: sltu a2, s0, a2
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a1, a1, a2
@@ -39609,7 +39609,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64VC-NEXT: sub a1, a0, s0
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -39678,13 +39678,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32VC-NEXT: sub a0, a4, a1
-; CHECK-RV32VC-NEXT: sub a2, a3, a2
-; CHECK-RV32VC-NEXT: sltu a4, a4, a0
-; CHECK-RV32VC-NEXT: sltu a3, a3, a2
+; CHECK-RV32VC-NEXT: sltu a4, a1, a4
+; CHECK-RV32VC-NEXT: sub a5, a3, a2
+; CHECK-RV32VC-NEXT: sltu a2, a2, a3
; CHECK-RV32VC-NEXT: addi a4, a4, -1
-; CHECK-RV32VC-NEXT: addi a3, a3, -1
+; CHECK-RV32VC-NEXT: addi a2, a2, -1
; CHECK-RV32VC-NEXT: and a4, a4, a0
-; CHECK-RV32VC-NEXT: and a0, a3, a2
+; CHECK-RV32VC-NEXT: and a0, a2, a5
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v0
@@ -39696,10 +39696,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT: sub a1, a0, a1
-; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a0, a1
+; CHECK-RV32VC-NEXT: sltu a0, a1, a0
; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -39841,9 +39841,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, s0
+; CHECK-RV64V-NEXT: sltu a4, s0, a4
; CHECK-RV64V-NEXT: sub a5, a3, a1
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a5
+; CHECK-RV64V-NEXT: sltu a3, a1, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
@@ -39859,17 +39859,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT: sub a3, a0, s0
-; CHECK-RV64V-NEXT: sub a2, s1, a2
-; CHECK-RV64V-NEXT: sltu a0, a0, a3
-; CHECK-RV64V-NEXT: sltu a4, s1, a2
+; CHECK-RV64V-NEXT: sub a3, s1, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, s1
+; CHECK-RV64V-NEXT: sub a4, a0, s0
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: and a3, a0, a3
-; CHECK-RV64V-NEXT: and a0, a4, a2
+; CHECK-RV64V-NEXT: addi a2, a2, -1
+; CHECK-RV64V-NEXT: and a4, a0, a4
+; CHECK-RV64V-NEXT: and a0, a2, a3
; CHECK-RV64V-NEXT: addi a2, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64V-NEXT: mv a2, a0
@@ -39890,23 +39890,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT: sub a3, a2, s0
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a2, a2, a3
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a3, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
+; CHECK-RV64V-NEXT: sub a1, a2, s0
+; CHECK-RV64V-NEXT: sltu a2, s0, a2
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a2, a2, a3
-; CHECK-RV64V-NEXT: and a0, a0, a1
-; CHECK-RV64V-NEXT: csrr a1, vlenb
-; CHECK-RV64V-NEXT: slli a1, a1, 3
-; CHECK-RV64V-NEXT: mv a3, a1
-; CHECK-RV64V-NEXT: slli a1, a1, 1
-; CHECK-RV64V-NEXT: add a1, a1, a3
-; CHECK-RV64V-NEXT: add a1, sp, a1
-; CHECK-RV64V-NEXT: addi a1, a1, 16
-; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: and a1, a2, a1
+; CHECK-RV64V-NEXT: and a0, a0, a3
+; CHECK-RV64V-NEXT: csrr a2, vlenb
+; CHECK-RV64V-NEXT: slli a2, a2, 3
+; CHECK-RV64V-NEXT: mv a3, a2
+; CHECK-RV64V-NEXT: slli a2, a2, 1
+; CHECK-RV64V-NEXT: add a2, a2, a3
+; CHECK-RV64V-NEXT: add a2, sp, a2
+; CHECK-RV64V-NEXT: addi a2, a2, 16
+; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: vsetvli zero, a1, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v29, (zero), v8
; CHECK-RV64V-NEXT: mv a1, a0
@@ -39923,7 +39923,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64V-NEXT: sub a1, a0, s0
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
; CHECK-RV64V-NEXT: and a0, a0, a1
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -39992,13 +39992,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32V-NEXT: sub a0, a4, a1
-; CHECK-RV32V-NEXT: sub a2, a3, a2
-; CHECK-RV32V-NEXT: sltu a4, a4, a0
-; CHECK-RV32V-NEXT: sltu a3, a3, a2
+; CHECK-RV32V-NEXT: sltu a4, a1, a4
+; CHECK-RV32V-NEXT: sub a5, a3, a2
+; CHECK-RV32V-NEXT: sltu a2, a2, a3
; CHECK-RV32V-NEXT: addi a4, a4, -1
-; CHECK-RV32V-NEXT: addi a3, a3, -1
+; CHECK-RV32V-NEXT: addi a2, a2, -1
; CHECK-RV32V-NEXT: and a4, a4, a0
-; CHECK-RV32V-NEXT: and a0, a3, a2
+; CHECK-RV32V-NEXT: and a0, a2, a5
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v0
@@ -40010,10 +40010,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT: sub a1, a0, a1
-; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: sub a2, a0, a1
+; CHECK-RV32V-NEXT: sltu a0, a1, a0
; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a0, a0, a1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -40108,102 +40108,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV64VC-NEXT: mv a3, a6
; CHECK-RV64VC-NEXT: .LBB919_2:
; CHECK-RV64VC-NEXT: slli a5, s0, 4
-; CHECK-RV64VC-NEXT: slli a7, s0, 1
-; CHECK-RV64VC-NEXT: slli a2, s0, 3
+; CHECK-RV64VC-NEXT: slli a1, s0, 1
+; CHECK-RV64VC-NEXT: slli a7, s0, 3
; CHECK-RV64VC-NEXT: mv a4, a3
-; CHECK-RV64VC-NEXT: bltu a3, a7, .LBB919_4
+; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB919_4
; CHECK-RV64VC-NEXT: # %bb.3:
-; CHECK-RV64VC-NEXT: mv a4, a7
+; CHECK-RV64VC-NEXT: mv a4, a1
; CHECK-RV64VC-NEXT: .LBB919_4:
; CHECK-RV64VC-NEXT: vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT: add a1, s1, a0
+; CHECK-RV64VC-NEXT: add a2, s1, a0
; CHECK-RV64VC-NEXT: add a5, a5, s1
-; CHECK-RV64VC-NEXT: add a2, a2, s1
+; CHECK-RV64VC-NEXT: add a7, a7, s1
; CHECK-RV64VC-NEXT: mv a0, a4
; CHECK-RV64VC-NEXT: bltu a4, s0, .LBB919_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a0, s0
; CHECK-RV64VC-NEXT: .LBB919_6:
-; CHECK-RV64VC-NEXT: vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 3
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 3
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT: addi a1, sp, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT: vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: mv a2, a1
-; CHECK-RV64VC-NEXT: slli a1, a1, 1
-; CHECK-RV64VC-NEXT: add a1, a1, a2
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: addi a2, sp, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: mv a5, a2
+; CHECK-RV64VC-NEXT: slli a2, a2, 1
+; CHECK-RV64VC-NEXT: add a2, a2, a5
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, s0
-; CHECK-RV64VC-NEXT: sub a1, a3, a7
-; CHECK-RV64VC-NEXT: sltu a2, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a1
+; CHECK-RV64VC-NEXT: sltu a2, s0, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a3, a1, a3
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a1
+; CHECK-RV64VC-NEXT: and a0, a3, a4
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT: mv a1, a0
+; CHECK-RV64VC-NEXT: mv a2, a0
; CHECK-RV64VC-NEXT: bltu a0, s0, .LBB919_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a2, s0
; CHECK-RV64VC-NEXT: .LBB919_8:
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT: sub a1, a0, s0
; CHECK-RV64VC-NEXT: sub a2, s2, a6
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, s2, a2
+; CHECK-RV64VC-NEXT: sltu a3, a6, s2
+; CHECK-RV64VC-NEXT: sub a4, a0, s0
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
-; CHECK-RV64VC-NEXT: and a1, a1, a0
+; CHECK-RV64VC-NEXT: and a4, a4, a0
; CHECK-RV64VC-NEXT: and a0, a3, a2
; CHECK-RV64VC-NEXT: addi a2, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64VC-NEXT: mv a2, a0
-; CHECK-RV64VC-NEXT: bltu a0, a7, .LBB919_10
+; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB919_10
; CHECK-RV64VC-NEXT: # %bb.9:
-; CHECK-RV64VC-NEXT: mv a2, a7
+; CHECK-RV64VC-NEXT: mv a2, a1
; CHECK-RV64VC-NEXT: .LBB919_10:
-; CHECK-RV64VC-NEXT: mv a1, a2
+; CHECK-RV64VC-NEXT: mv a3, a2
; CHECK-RV64VC-NEXT: bltu a2, s0, .LBB919_12
; CHECK-RV64VC-NEXT: # %bb.11:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a3, s0
; CHECK-RV64VC-NEXT: .LBB919_12:
-; CHECK-RV64VC-NEXT: csrr a3, vlenb
-; CHECK-RV64VC-NEXT: slli a3, a3, 3
-; CHECK-RV64VC-NEXT: add a3, a3, sp
-; CHECK-RV64VC-NEXT: addi a3, a3, 16
-; CHECK-RV64VC-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: csrr a4, vlenb
+; CHECK-RV64VC-NEXT: slli a4, a4, 3
+; CHECK-RV64VC-NEXT: add a4, a4, sp
+; CHECK-RV64VC-NEXT: addi a4, a4, 16
+; CHECK-RV64VC-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT: sub a3, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: sub a1, a2, s0
-; CHECK-RV64VC-NEXT: sub a3, a0, a7
-; CHECK-RV64VC-NEXT: sltu a2, a2, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a3
+; CHECK-RV64VC-NEXT: sltu a2, s0, a2
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a1, a1, a2
@@ -40233,7 +40233,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64VC-NEXT: sub a1, a0, s0
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -40302,13 +40302,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32VC-NEXT: sub a0, a4, a1
-; CHECK-RV32VC-NEXT: sub a2, a3, a2
-; CHECK-RV32VC-NEXT: sltu a4, a4, a0
-; CHECK-RV32VC-NEXT: sltu a3, a3, a2
+; CHECK-RV32VC-NEXT: sltu a4, a1, a4
+; CHECK-RV32VC-NEXT: sub a5, a3, a2
+; CHECK-RV32VC-NEXT: sltu a2, a2, a3
; CHECK-RV32VC-NEXT: addi a4, a4, -1
-; CHECK-RV32VC-NEXT: addi a3, a3, -1
+; CHECK-RV32VC-NEXT: addi a2, a2, -1
; CHECK-RV32VC-NEXT: and a4, a4, a0
-; CHECK-RV32VC-NEXT: and a0, a3, a2
+; CHECK-RV32VC-NEXT: and a0, a2, a5
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v0
@@ -40320,10 +40320,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT: sub a1, a0, a1
-; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a0, a1
+; CHECK-RV32VC-NEXT: sltu a0, a1, a0
; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
index 380287dd555c9..1c95c753c8ed1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
@@ -263,13 +263,13 @@ define <vscale x 32 x bfloat> @vp_rint_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v5, v6
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -321,14 +321,14 @@ define <vscale x 32 x bfloat> @vp_rint_nxv32bf16_unmasked(<vscale x 32 x bfloat>
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v6, v7
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -786,13 +786,13 @@ define <vscale x 32 x half> @vp_rint_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v5, v6
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -858,14 +858,14 @@ define <vscale x 32 x half> @vp_rint_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v6, v7
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -1751,7 +1751,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFH-NEXT: sub a2, a0, a1
; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT: sltu a3, a0, a2
+; RV32ZVFH-NEXT: sltu a3, a1, a0
; RV32ZVFH-NEXT: addi a3, a3, -1
; RV32ZVFH-NEXT: and a2, a3, a2
; RV32ZVFH-NEXT: vmv1r.v v0, v6
@@ -1793,7 +1793,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vmv1r.v v0, v6
@@ -1834,7 +1834,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFMIN-NEXT: sub a2, a0, a1
; RV32ZVFMIN-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFMIN-NEXT: sltu a3, a0, a2
+; RV32ZVFMIN-NEXT: sltu a3, a1, a0
; RV32ZVFMIN-NEXT: addi a3, a3, -1
; RV32ZVFMIN-NEXT: and a2, a3, a2
; RV32ZVFMIN-NEXT: vmv1r.v v0, v6
@@ -1876,7 +1876,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -1917,7 +1917,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFH-NEXT: sub a3, a0, a1
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT: sltu a2, a0, a3
+; RV32ZVFH-NEXT: sltu a2, a1, a0
; RV32ZVFH-NEXT: addi a2, a2, -1
; RV32ZVFH-NEXT: and a2, a2, a3
; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -1947,7 +1947,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -1976,7 +1976,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
; RV32ZVFMIN-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFMIN-NEXT: sub a3, a0, a1
; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFMIN-NEXT: sltu a2, a0, a3
+; RV32ZVFMIN-NEXT: sltu a2, a1, a0
; RV32ZVFMIN-NEXT: addi a2, a2, -1
; RV32ZVFMIN-NEXT: and a2, a2, a3
; RV32ZVFMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2006,7 +2006,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
index 37c036d38148a..605b07c81f45a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v5, v6
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v6, v7
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v5, v6
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v6, v7
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFH-NEXT: sub a2, a0, a1
; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT: sltu a3, a0, a2
+; RV32ZVFH-NEXT: sltu a3, a1, a0
; RV32ZVFH-NEXT: addi a3, a3, -1
; RV32ZVFH-NEXT: and a2, a3, a2
; RV32ZVFH-NEXT: vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFHMIN-NEXT: sub a2, a0, a1
; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT: sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT: sltu a3, a1, a0
; RV32ZVFHMIN-NEXT: addi a3, a3, -1
; RV32ZVFHMIN-NEXT: and a2, a3, a2
; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFH-NEXT: sub a3, a0, a1
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT: sltu a2, a0, a3
+; RV32ZVFH-NEXT: sltu a2, a1, a0
; RV32ZVFH-NEXT: addi a2, a2, -1
; RV32ZVFH-NEXT: and a2, a2, a3
; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFHMIN-NEXT: sub a3, a0, a1
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT: sltu a2, a1, a0
; RV32ZVFHMIN-NEXT: addi a2, a2, -1
; RV32ZVFHMIN-NEXT: and a2, a2, a3
; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
index 37a9ec1c0a8aa..6869bc2050698 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16(<vscale x 32 x bfloat> %va
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v5, v6
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16_unmasked(<vscale x 32 x bf
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v6, v7
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v5, v6
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half>
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v6, v7
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFH-NEXT: sub a2, a0, a1
; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT: sltu a3, a0, a2
+; RV32ZVFH-NEXT: sltu a3, a1, a0
; RV32ZVFH-NEXT: addi a3, a3, -1
; RV32ZVFH-NEXT: and a2, a3, a2
; RV32ZVFH-NEXT: vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFHMIN-NEXT: sub a2, a0, a1
; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT: sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT: sltu a3, a1, a0
; RV32ZVFHMIN-NEXT: addi a3, a3, -1
; RV32ZVFHMIN-NEXT: and a2, a3, a2
; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFH-NEXT: sub a3, a0, a1
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT: sltu a2, a0, a3
+; RV32ZVFH-NEXT: sltu a2, a1, a0
; RV32ZVFH-NEXT: addi a2, a2, -1
; RV32ZVFH-NEXT: and a2, a2, a3
; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFHMIN-NEXT: sub a3, a0, a1
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT: sltu a2, a1, a0
; RV32ZVFHMIN-NEXT: addi a2, a2, -1
; RV32ZVFHMIN-NEXT: and a2, a2, a3
; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
index 5553b988fec97..8869a440c8634 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16(<vscale x 32 x bfloat> %
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v5, v6
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16_unmasked(<vscale x 32 x
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v6, v7
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v5, v6
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x hal
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v6, v7
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFH-NEXT: sub a2, a0, a1
; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT: sltu a3, a0, a2
+; RV32ZVFH-NEXT: sltu a3, a1, a0
; RV32ZVFH-NEXT: addi a3, a3, -1
; RV32ZVFH-NEXT: and a2, a3, a2
; RV32ZVFH-NEXT: vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFHMIN-NEXT: sub a2, a0, a1
; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT: sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT: sltu a3, a1, a0
; RV32ZVFHMIN-NEXT: addi a3, a3, -1
; RV32ZVFHMIN-NEXT: and a2, a3, a2
; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFH-NEXT: sub a3, a0, a1
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT: sltu a2, a0, a3
+; RV32ZVFH-NEXT: sltu a2, a1, a0
; RV32ZVFH-NEXT: addi a2, a2, -1
; RV32ZVFH-NEXT: and a2, a2, a3
; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFHMIN-NEXT: sub a3, a0, a1
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT: sltu a2, a1, a0
; RV32ZVFHMIN-NEXT: addi a2, a2, -1
; RV32ZVFHMIN-NEXT: and a2, a2, a3
; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 634e58198def3..b67ab5c3c9efa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1337,211 +1337,404 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, b
}
define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vscale x 64 x bfloat> %vb, <vscale x 64 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: fcmp_oeq_vv_nxv64bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: mv a3, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a1, a1, a3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: mv a3, a1
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a1, a1, a3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv8r.v v0, v16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: mv a3, a1
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, a1, a3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a1, a3, 3
-; CHECK-NEXT: slli a5, a3, 2
-; CHECK-NEXT: slli a4, a3, 1
-; CHECK-NEXT: add a1, a0, a1
-; CHECK-NEXT: sub a6, a2, a5
-; CHECK-NEXT: vl8re16.v v24, (a1)
-; CHECK-NEXT: sltu a1, a2, a6
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a6, a1, a6
-; CHECK-NEXT: sub a1, a6, a4
-; CHECK-NEXT: sltu a7, a6, a1
-; CHECK-NEXT: addi a7, a7, -1
-; CHECK-NEXT: and a7, a7, a1
-; CHECK-NEXT: srli a1, a3, 1
-; CHECK-NEXT: srli a3, a3, 2
-; CHECK-NEXT: csrr t0, vlenb
-; CHECK-NEXT: slli t0, t0, 1
-; CHECK-NEXT: mv t1, t0
-; CHECK-NEXT: slli t0, t0, 2
-; CHECK-NEXT: add t1, t1, t0
-; CHECK-NEXT: slli t0, t0, 1
-; CHECK-NEXT: add t0, t0, t1
-; CHECK-NEXT: add t0, sp, t0
-; CHECK-NEXT: addi t0, t0, 16
-; CHECK-NEXT: vl1r.v v8, (t0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT: vslidedown.vx v16, v8, a1
-; CHECK-NEXT: vl8re16.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: mv t0, a0
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a0, a0, t0
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v8, v16, a3
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a7, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4
-; CHECK-NEXT: bltu a6, a4, .LBB85_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a6, a4
-; CHECK-NEXT: .LBB85_2:
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v5, v8, v16, v0.t
-; CHECK-NEXT: vsetvli zero, a6, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v7, v24, v16, v0.t
-; CHECK-NEXT: bltu a2, a5, .LBB85_4
-; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: mv a2, a5
-; CHECK-NEXT: .LBB85_4:
-; CHECK-NEXT: sub a0, a2, a4
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a5, a5, 1
-; CHECK-NEXT: mv a6, a5
-; CHECK-NEXT: slli a5, a5, 2
-; CHECK-NEXT: add a6, a6, a5
-; CHECK-NEXT: slli a5, a5, 1
-; CHECK-NEXT: add a5, a5, a6
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vl1r.v v8, (a5) # vscale x 8-byte Folded Reload
-; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v8, a3
-; CHECK-NEXT: sltu a5, a2, a0
-; CHECK-NEXT: addi a5, a5, -1
-; CHECK-NEXT: and a0, a5, a0
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a5, a5, 1
-; CHECK-NEXT: mv a6, a5
-; CHECK-NEXT: slli a5, a5, 3
-; CHECK-NEXT: add a5, a5, a6
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: mv a5, a0
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a0, a0, a5
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v10, v16, v24, v0.t
-; CHECK-NEXT: vmv1r.v v9, v7
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v9, v5, a3
-; CHECK-NEXT: bltu a2, a4, .LBB85_6
-; CHECK-NEXT: # %bb.5:
-; CHECK-NEXT: mv a2, a4
-; CHECK-NEXT: .LBB85_6:
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: mv a4, a0
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, a0, a4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: mv a2, a0
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: mv a2, a0
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a2, a2, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v8, v16, v24, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v10, a3
-; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v9, a1
-; CHECK-NEXT: vmv.v.v v0, v8
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a1, a1, a0
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a1, a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; CHECK32-LABEL: fcmp_oeq_vv_nxv64bf16:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: addi sp, sp, -16
+; CHECK32-NEXT: .cfi_def_cfa_offset 16
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: mv a3, a1
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: add a3, a3, a1
+; CHECK32-NEXT: slli a1, a1, 2
+; CHECK32-NEXT: add a3, a3, a1
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: add a1, a1, a3
+; CHECK32-NEXT: sub sp, sp, a1
+; CHECK32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: mv a3, a1
+; CHECK32-NEXT: slli a1, a1, 2
+; CHECK32-NEXT: add a3, a3, a1
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: add a1, a1, a3
+; CHECK32-NEXT: add a1, sp, a1
+; CHECK32-NEXT: addi a1, a1, 16
+; CHECK32-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK32-NEXT: vmv8r.v v0, v16
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: mv a3, a1
+; CHECK32-NEXT: slli a1, a1, 3
+; CHECK32-NEXT: add a1, a1, a3
+; CHECK32-NEXT: add a1, sp, a1
+; CHECK32-NEXT: addi a1, a1, 16
+; CHECK32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT: csrr a3, vlenb
+; CHECK32-NEXT: srli a1, a3, 1
+; CHECK32-NEXT: slli a4, a3, 3
+; CHECK32-NEXT: slli a6, a3, 2
+; CHECK32-NEXT: slli a5, a3, 1
+; CHECK32-NEXT: add a4, a0, a4
+; CHECK32-NEXT: sub a7, a2, a6
+; CHECK32-NEXT: sltu t0, a6, a2
+; CHECK32-NEXT: vl8re16.v v24, (a4)
+; CHECK32-NEXT: addi t0, t0, -1
+; CHECK32-NEXT: and a7, t0, a7
+; CHECK32-NEXT: sub a4, a7, a5
+; CHECK32-NEXT: sltu t0, a5, a7
+; CHECK32-NEXT: addi t0, t0, -1
+; CHECK32-NEXT: and t0, t0, a4
+; CHECK32-NEXT: srli a4, a3, 2
+; CHECK32-NEXT: csrr t1, vlenb
+; CHECK32-NEXT: slli t1, t1, 1
+; CHECK32-NEXT: mv t2, t1
+; CHECK32-NEXT: slli t1, t1, 2
+; CHECK32-NEXT: add t2, t2, t1
+; CHECK32-NEXT: slli t1, t1, 1
+; CHECK32-NEXT: add t1, t1, t2
+; CHECK32-NEXT: add t1, sp, t1
+; CHECK32-NEXT: addi t1, t1, 16
+; CHECK32-NEXT: vl1r.v v8, (t1) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: vslidedown.vx v16, v8, a1
+; CHECK32-NEXT: vl8re16.v v8, (a0)
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: mv t1, a0
+; CHECK32-NEXT: slli a0, a0, 2
+; CHECK32-NEXT: add a0, a0, t1
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK32-NEXT: vslidedown.vx v8, v16, a4
+; CHECK32-NEXT: addi a0, sp, 16
+; CHECK32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT: vsetvli zero, t0, e16, m4, ta, ma
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v8, v4
+; CHECK32-NEXT: bltu a7, a5, .LBB85_2
+; CHECK32-NEXT: # %bb.1:
+; CHECK32-NEXT: mv a7, a5
+; CHECK32-NEXT: .LBB85_2:
+; CHECK32-NEXT: addi a0, sp, 16
+; CHECK32-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT: vmfeq.vv v5, v8, v16, v0.t
+; CHECK32-NEXT: vsetvli zero, a7, e16, m4, ta, ma
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT: vmfeq.vv v7, v24, v16, v0.t
+; CHECK32-NEXT: bltu a2, a6, .LBB85_4
+; CHECK32-NEXT: # %bb.3:
+; CHECK32-NEXT: mv a2, a6
+; CHECK32-NEXT: .LBB85_4:
+; CHECK32-NEXT: sub a0, a2, a5
+; CHECK32-NEXT: sltu a6, a5, a2
+; CHECK32-NEXT: csrr a7, vlenb
+; CHECK32-NEXT: slli a7, a7, 1
+; CHECK32-NEXT: mv t0, a7
+; CHECK32-NEXT: slli a7, a7, 2
+; CHECK32-NEXT: add t0, t0, a7
+; CHECK32-NEXT: slli a7, a7, 1
+; CHECK32-NEXT: add a7, a7, t0
+; CHECK32-NEXT: add a7, sp, a7
+; CHECK32-NEXT: addi a7, a7, 16
+; CHECK32-NEXT: vl1r.v v8, (a7) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: vsetvli a7, zero, e8, mf2, ta, ma
+; CHECK32-NEXT: vslidedown.vx v0, v8, a4
+; CHECK32-NEXT: addi a6, a6, -1
+; CHECK32-NEXT: and a0, a6, a0
+; CHECK32-NEXT: csrr a6, vlenb
+; CHECK32-NEXT: slli a6, a6, 1
+; CHECK32-NEXT: mv a7, a6
+; CHECK32-NEXT: slli a6, a6, 3
+; CHECK32-NEXT: add a6, a6, a7
+; CHECK32-NEXT: add a6, sp, a6
+; CHECK32-NEXT: addi a6, a6, 16
+; CHECK32-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: mv a6, a0
+; CHECK32-NEXT: slli a0, a0, 2
+; CHECK32-NEXT: add a0, a0, a6
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK32-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT: vmfeq.vv v10, v16, v24, v0.t
+; CHECK32-NEXT: vmv1r.v v9, v7
+; CHECK32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK32-NEXT: vslideup.vx v9, v5, a4
+; CHECK32-NEXT: bltu a2, a5, .LBB85_6
+; CHECK32-NEXT: # %bb.5:
+; CHECK32-NEXT: mv a2, a5
+; CHECK32-NEXT: .LBB85_6:
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: mv a5, a0
+; CHECK32-NEXT: slli a0, a0, 3
+; CHECK32-NEXT: add a0, a0, a5
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: mv a2, a0
+; CHECK32-NEXT: slli a0, a0, 2
+; CHECK32-NEXT: add a0, a0, a2
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: mv a2, a0
+; CHECK32-NEXT: slli a0, a0, 2
+; CHECK32-NEXT: add a2, a2, a0
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: add a0, a0, a2
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT: vmfeq.vv v8, v16, v24, v0.t
+; CHECK32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK32-NEXT: vslideup.vx v8, v10, a4
+; CHECK32-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; CHECK32-NEXT: vslideup.vx v8, v9, a1
+; CHECK32-NEXT: vmv.v.v v0, v8
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: mv a1, a0
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: add a1, a1, a0
+; CHECK32-NEXT: slli a0, a0, 2
+; CHECK32-NEXT: add a1, a1, a0
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: add a0, a0, a1
+; CHECK32-NEXT: add sp, sp, a0
+; CHECK32-NEXT: .cfi_def_cfa sp, 16
+; CHECK32-NEXT: addi sp, sp, 16
+; CHECK32-NEXT: .cfi_def_cfa_offset 0
+; CHECK32-NEXT: ret
+;
+; CHECK64-LABEL: fcmp_oeq_vv_nxv64bf16:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: addi sp, sp, -16
+; CHECK64-NEXT: .cfi_def_cfa_offset 16
+; CHECK64-NEXT: csrr a1, vlenb
+; CHECK64-NEXT: mv a3, a1
+; CHECK64-NEXT: slli a1, a1, 1
+; CHECK64-NEXT: add a3, a3, a1
+; CHECK64-NEXT: slli a1, a1, 2
+; CHECK64-NEXT: add a3, a3, a1
+; CHECK64-NEXT: slli a1, a1, 1
+; CHECK64-NEXT: add a1, a1, a3
+; CHECK64-NEXT: sub sp, sp, a1
+; CHECK64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
+; CHECK64-NEXT: csrr a1, vlenb
+; CHECK64-NEXT: slli a1, a1, 1
+; CHECK64-NEXT: mv a3, a1
+; CHECK64-NEXT: slli a1, a1, 3
+; CHECK64-NEXT: add a1, a1, a3
+; CHECK64-NEXT: add a1, sp, a1
+; CHECK64-NEXT: addi a1, a1, 16
+; CHECK64-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK64-NEXT: vmv8r.v v0, v16
+; CHECK64-NEXT: csrr a1, vlenb
+; CHECK64-NEXT: mv a3, a1
+; CHECK64-NEXT: slli a1, a1, 1
+; CHECK64-NEXT: add a3, a3, a1
+; CHECK64-NEXT: slli a1, a1, 3
+; CHECK64-NEXT: add a1, a1, a3
+; CHECK64-NEXT: add a1, sp, a1
+; CHECK64-NEXT: addi a1, a1, 16
+; CHECK64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT: csrr a3, vlenb
+; CHECK64-NEXT: slli a1, a3, 3
+; CHECK64-NEXT: slli a5, a3, 2
+; CHECK64-NEXT: slli a4, a3, 1
+; CHECK64-NEXT: add a1, a0, a1
+; CHECK64-NEXT: sub a6, a2, a5
+; CHECK64-NEXT: sltu a7, a5, a2
+; CHECK64-NEXT: vl8re16.v v24, (a1)
+; CHECK64-NEXT: addi a7, a7, -1
+; CHECK64-NEXT: and a6, a7, a6
+; CHECK64-NEXT: sub a1, a6, a4
+; CHECK64-NEXT: sltu a7, a4, a6
+; CHECK64-NEXT: addi a7, a7, -1
+; CHECK64-NEXT: and a7, a7, a1
+; CHECK64-NEXT: srli a1, a3, 1
+; CHECK64-NEXT: srli a3, a3, 2
+; CHECK64-NEXT: csrr t0, vlenb
+; CHECK64-NEXT: slli t0, t0, 1
+; CHECK64-NEXT: mv t1, t0
+; CHECK64-NEXT: slli t0, t0, 3
+; CHECK64-NEXT: add t0, t0, t1
+; CHECK64-NEXT: add t0, sp, t0
+; CHECK64-NEXT: addi t0, t0, 16
+; CHECK64-NEXT: vl1r.v v8, (t0) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT: vslidedown.vx v16, v8, a1
+; CHECK64-NEXT: vl8re16.v v8, (a0)
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: mv t0, a0
+; CHECK64-NEXT: slli a0, a0, 2
+; CHECK64-NEXT: add a0, a0, t0
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK64-NEXT: vslidedown.vx v8, v16, a3
+; CHECK64-NEXT: addi a0, sp, 16
+; CHECK64-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT: vsetvli zero, a7, e16, m4, ta, ma
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v8, v4
+; CHECK64-NEXT: bltu a6, a4, .LBB85_2
+; CHECK64-NEXT: # %bb.1:
+; CHECK64-NEXT: mv a6, a4
+; CHECK64-NEXT: .LBB85_2:
+; CHECK64-NEXT: addi a0, sp, 16
+; CHECK64-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT: vmfeq.vv v5, v8, v16, v0.t
+; CHECK64-NEXT: vsetvli zero, a6, e16, m4, ta, ma
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT: vmfeq.vv v6, v24, v16, v0.t
+; CHECK64-NEXT: bltu a2, a5, .LBB85_4
+; CHECK64-NEXT: # %bb.3:
+; CHECK64-NEXT: mv a2, a5
+; CHECK64-NEXT: .LBB85_4:
+; CHECK64-NEXT: sub a0, a2, a4
+; CHECK64-NEXT: sltu a5, a4, a2
+; CHECK64-NEXT: csrr a6, vlenb
+; CHECK64-NEXT: slli a6, a6, 1
+; CHECK64-NEXT: mv a7, a6
+; CHECK64-NEXT: slli a6, a6, 3
+; CHECK64-NEXT: add a6, a6, a7
+; CHECK64-NEXT: add a6, sp, a6
+; CHECK64-NEXT: addi a6, a6, 16
+; CHECK64-NEXT: vl1r.v v7, (a6) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT: vsetvli a6, zero, e8, mf2, ta, ma
+; CHECK64-NEXT: vslidedown.vx v0, v7, a3
+; CHECK64-NEXT: addi a5, a5, -1
+; CHECK64-NEXT: and a0, a5, a0
+; CHECK64-NEXT: csrr a5, vlenb
+; CHECK64-NEXT: mv a6, a5
+; CHECK64-NEXT: slli a5, a5, 1
+; CHECK64-NEXT: add a6, a6, a5
+; CHECK64-NEXT: slli a5, a5, 3
+; CHECK64-NEXT: add a5, a5, a6
+; CHECK64-NEXT: add a5, sp, a5
+; CHECK64-NEXT: addi a5, a5, 16
+; CHECK64-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: mv a5, a0
+; CHECK64-NEXT: slli a0, a0, 2
+; CHECK64-NEXT: add a0, a0, a5
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK64-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT: vmfeq.vv v4, v16, v24, v0.t
+; CHECK64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK64-NEXT: vslideup.vx v6, v5, a3
+; CHECK64-NEXT: bltu a2, a4, .LBB85_6
+; CHECK64-NEXT: # %bb.5:
+; CHECK64-NEXT: mv a2, a4
+; CHECK64-NEXT: .LBB85_6:
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: mv a4, a0
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: add a4, a4, a0
+; CHECK64-NEXT: slli a0, a0, 3
+; CHECK64-NEXT: add a0, a0, a4
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: mv a2, a0
+; CHECK64-NEXT: slli a0, a0, 2
+; CHECK64-NEXT: add a0, a0, a2
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK64-NEXT: vmv1r.v v0, v7
+; CHECK64-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT: vmfeq.vv v8, v16, v24, v0.t
+; CHECK64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK64-NEXT: vslideup.vx v8, v4, a3
+; CHECK64-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK64-NEXT: vslideup.vx v8, v6, a1
+; CHECK64-NEXT: vmv.v.v v0, v8
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: mv a1, a0
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: add a1, a1, a0
+; CHECK64-NEXT: slli a0, a0, 2
+; CHECK64-NEXT: add a1, a1, a0
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: add a0, a0, a1
+; CHECK64-NEXT: add sp, sp, a0
+; CHECK64-NEXT: .cfi_def_cfa sp, 16
+; CHECK64-NEXT: addi sp, sp, 16
+; CHECK64-NEXT: .cfi_def_cfa_offset 0
+; CHECK64-NEXT: ret
%v = call <vscale x 64 x i1> @llvm.vp.fcmp.nxv64bf16(<vscale x 64 x bfloat> %va, <vscale x 64 x bfloat> %vb, metadata !"oeq", <vscale x 64 x i1> %m, i32 %evl)
ret <vscale x 64 x i1> %v
}
@@ -3479,257 +3672,6 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
}
define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscale x 64 x half> %vb, <vscale x 64 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: fcmp_oeq_vv_nxv64f16:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: addi sp, sp, -16
-; ZVFH-NEXT: .cfi_def_cfa_offset 16
-; ZVFH-NEXT: csrr a1, vlenb
-; ZVFH-NEXT: slli a1, a1, 3
-; ZVFH-NEXT: sub sp, sp, a1
-; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFH-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; ZVFH-NEXT: vmv1r.v v7, v0
-; ZVFH-NEXT: addi a1, sp, 16
-; ZVFH-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: csrr a3, vlenb
-; ZVFH-NEXT: srli a1, a3, 1
-; ZVFH-NEXT: slli a4, a3, 3
-; ZVFH-NEXT: slli a3, a3, 2
-; ZVFH-NEXT: add a4, a0, a4
-; ZVFH-NEXT: sub a5, a2, a3
-; ZVFH-NEXT: vl8re16.v v24, (a4)
-; ZVFH-NEXT: sltu a4, a2, a5
-; ZVFH-NEXT: addi a4, a4, -1
-; ZVFH-NEXT: vl8re16.v v8, (a0)
-; ZVFH-NEXT: vslidedown.vx v0, v0, a1
-; ZVFH-NEXT: and a4, a4, a5
-; ZVFH-NEXT: vsetvli zero, a4, e16, m8, ta, ma
-; ZVFH-NEXT: vmfeq.vv v6, v16, v24, v0.t
-; ZVFH-NEXT: bltu a2, a3, .LBB171_2
-; ZVFH-NEXT: # %bb.1:
-; ZVFH-NEXT: mv a2, a3
-; ZVFH-NEXT: .LBB171_2:
-; ZVFH-NEXT: vmv1r.v v0, v7
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; ZVFH-NEXT: vmfeq.vv v16, v24, v8, v0.t
-; ZVFH-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; ZVFH-NEXT: vslideup.vx v16, v6, a1
-; ZVFH-NEXT: vmv.v.v v0, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add sp, sp, a0
-; ZVFH-NEXT: .cfi_def_cfa sp, 16
-; ZVFH-NEXT: addi sp, sp, 16
-; ZVFH-NEXT: .cfi_def_cfa_offset 0
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv64f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: mv a3, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a3, a3, a1
-; ZVFHMIN-NEXT: slli a1, a1, 2
-; ZVFHMIN-NEXT: add a3, a3, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a3
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: mv a3, a1
-; ZVFHMIN-NEXT: slli a1, a1, 2
-; ZVFHMIN-NEXT: add a3, a3, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a3
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; ZVFHMIN-NEXT: vmv8r.v v0, v16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: mv a3, a1
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: add a1, a1, a3
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a1, a3, 3
-; ZVFHMIN-NEXT: slli a5, a3, 2
-; ZVFHMIN-NEXT: slli a4, a3, 1
-; ZVFHMIN-NEXT: add a1, a0, a1
-; ZVFHMIN-NEXT: sub a6, a2, a5
-; ZVFHMIN-NEXT: vl8re16.v v24, (a1)
-; ZVFHMIN-NEXT: sltu a1, a2, a6
-; ZVFHMIN-NEXT: addi a1, a1, -1
-; ZVFHMIN-NEXT: and a6, a1, a6
-; ZVFHMIN-NEXT: sub a1, a6, a4
-; ZVFHMIN-NEXT: sltu a7, a6, a1
-; ZVFHMIN-NEXT: addi a7, a7, -1
-; ZVFHMIN-NEXT: and a7, a7, a1
-; ZVFHMIN-NEXT: srli a1, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: csrr t0, vlenb
-; ZVFHMIN-NEXT: slli t0, t0, 1
-; ZVFHMIN-NEXT: mv t1, t0
-; ZVFHMIN-NEXT: slli t0, t0, 2
-; ZVFHMIN-NEXT: add t1, t1, t0
-; ZVFHMIN-NEXT: slli t0, t0, 1
-; ZVFHMIN-NEXT: add t0, t0, t1
-; ZVFHMIN-NEXT: add t0, sp, t0
-; ZVFHMIN-NEXT: addi t0, t0, 16
-; ZVFHMIN-NEXT: vl1r.v v8, (t0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT: vslidedown.vx v16, v8, a1
-; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: mv t0, a0
-; ZVFHMIN-NEXT: slli a0, a0, 2
-; ZVFHMIN-NEXT: add a0, a0, t0
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v8, v16, a3
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a7, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4
-; ZVFHMIN-NEXT: bltu a6, a4, .LBB171_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a6, a4
-; ZVFHMIN-NEXT: .LBB171_2:
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v5, v8, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, a6, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v16, v0.t
-; ZVFHMIN-NEXT: bltu a2, a5, .LBB171_4
-; ZVFHMIN-NEXT: # %bb.3:
-; ZVFHMIN-NEXT: mv a2, a5
-; ZVFHMIN-NEXT: .LBB171_4:
-; ZVFHMIN-NEXT: sub a0, a2, a4
-; ZVFHMIN-NEXT: csrr a5, vlenb
-; ZVFHMIN-NEXT: slli a5, a5, 1
-; ZVFHMIN-NEXT: mv a6, a5
-; ZVFHMIN-NEXT: slli a5, a5, 2
-; ZVFHMIN-NEXT: add a6, a6, a5
-; ZVFHMIN-NEXT: slli a5, a5, 1
-; ZVFHMIN-NEXT: add a5, a5, a6
-; ZVFHMIN-NEXT: add a5, sp, a5
-; ZVFHMIN-NEXT: addi a5, a5, 16
-; ZVFHMIN-NEXT: vl1r.v v8, (a5) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a5, a2, a0
-; ZVFHMIN-NEXT: addi a5, a5, -1
-; ZVFHMIN-NEXT: and a0, a5, a0
-; ZVFHMIN-NEXT: csrr a5, vlenb
-; ZVFHMIN-NEXT: slli a5, a5, 1
-; ZVFHMIN-NEXT: mv a6, a5
-; ZVFHMIN-NEXT: slli a5, a5, 3
-; ZVFHMIN-NEXT: add a5, a5, a6
-; ZVFHMIN-NEXT: add a5, sp, a5
-; ZVFHMIN-NEXT: addi a5, a5, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: mv a5, a0
-; ZVFHMIN-NEXT: slli a0, a0, 2
-; ZVFHMIN-NEXT: add a0, a0, a5
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v10, v16, v24, v0.t
-; ZVFHMIN-NEXT: vmv1r.v v9, v7
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslideup.vx v9, v5, a3
-; ZVFHMIN-NEXT: bltu a2, a4, .LBB171_6
-; ZVFHMIN-NEXT: # %bb.5:
-; ZVFHMIN-NEXT: mv a2, a4
-; ZVFHMIN-NEXT: .LBB171_6:
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: mv a4, a0
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, a0, a4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: mv a2, a0
-; ZVFHMIN-NEXT: slli a0, a0, 2
-; ZVFHMIN-NEXT: add a0, a0, a2
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: mv a2, a0
-; ZVFHMIN-NEXT: slli a0, a0, 2
-; ZVFHMIN-NEXT: add a2, a2, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a2
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslideup.vx v8, v10, a3
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; ZVFHMIN-NEXT: vslideup.vx v8, v9, a1
-; ZVFHMIN-NEXT: vmv.v.v v0, v8
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a1, a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 2
-; ZVFHMIN-NEXT: add a1, a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%v = call <vscale x 64 x i1> @llvm.vp.fcmp.nxv64f16(<vscale x 64 x half> %va, <vscale x 64 x half> %vb, metadata !"oeq", <vscale x 64 x i1> %m, i32 %evl)
ret <vscale x 64 x i1> %v
}
@@ -4879,7 +4821,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK32-NEXT: add a4, s3, a6
; CHECK32-NEXT: vl8re64.v v24, (s3)
; CHECK32-NEXT: sub a6, a3, s0
-; CHECK32-NEXT: sltu a7, a3, a6
+; CHECK32-NEXT: sltu a7, s0, a3
; CHECK32-NEXT: addi a7, a7, -1
; CHECK32-NEXT: and a6, a7, a6
; CHECK32-NEXT: csrr a7, vlenb
@@ -4919,7 +4861,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK32-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
; CHECK32-NEXT: vl8re64.v v16, (a4)
; CHECK32-NEXT: sub a1, s1, a2
-; CHECK32-NEXT: sltu a2, s1, a1
+; CHECK32-NEXT: sltu a2, a2, s1
; CHECK32-NEXT: vl8re64.v v24, (s2)
; CHECK32-NEXT: addi a2, a2, -1
; CHECK32-NEXT: and s1, a2, a1
@@ -4964,7 +4906,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK32-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
; CHECK32-NEXT: vslideup.vx v9, v8, s4
; CHECK32-NEXT: sub a1, s1, s0
-; CHECK32-NEXT: sltu a2, s1, a1
+; CHECK32-NEXT: sltu a2, s0, s1
; CHECK32-NEXT: addi a2, a2, -1
; CHECK32-NEXT: and a1, a2, a1
; CHECK32-NEXT: csrr a2, vlenb
@@ -4979,7 +4921,8 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
; CHECK32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK32-NEXT: vmfeq.vv v8, v24, v16, v0.t
-; CHECK32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK32-NEXT: srli s0, s0, 1
+; CHECK32-NEXT: vsetvli zero, s0, e8, mf2, ta, ma
; CHECK32-NEXT: vslideup.vx v9, v8, a0
; CHECK32-NEXT: vmv1r.v v0, v9
; CHECK32-NEXT: csrr a0, vlenb
@@ -5090,7 +5033,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK64-NEXT: add a4, s3, a6
; CHECK64-NEXT: vl8re64.v v24, (s3)
; CHECK64-NEXT: sub a6, a3, s0
-; CHECK64-NEXT: sltu a7, a3, a6
+; CHECK64-NEXT: sltu a7, s0, a3
; CHECK64-NEXT: addi a7, a7, -1
; CHECK64-NEXT: and a6, a7, a6
; CHECK64-NEXT: csrr a7, vlenb
@@ -5130,7 +5073,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK64-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
; CHECK64-NEXT: vl8re64.v v16, (a4)
; CHECK64-NEXT: sub a1, s1, a2
-; CHECK64-NEXT: sltu a2, s1, a1
+; CHECK64-NEXT: sltu a2, a2, s1
; CHECK64-NEXT: vl8re64.v v24, (s2)
; CHECK64-NEXT: addi a2, a2, -1
; CHECK64-NEXT: and s1, a2, a1
@@ -5175,7 +5118,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK64-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
; CHECK64-NEXT: vslideup.vx v9, v8, s4
; CHECK64-NEXT: sub a1, s1, s0
-; CHECK64-NEXT: sltu a2, s1, a1
+; CHECK64-NEXT: sltu a2, s0, s1
; CHECK64-NEXT: addi a2, a2, -1
; CHECK64-NEXT: and a1, a2, a1
; CHECK64-NEXT: csrr a2, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index c1de57bf850ac..829a3b43bd984 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -1083,7 +1083,7 @@ define <vscale x 128 x i1> @icmp_eq_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
; CHECK-NEXT: add a2, a0, a1
; CHECK-NEXT: sub a4, a3, a1
; CHECK-NEXT: vl8r.v v24, (a2)
-; CHECK-NEXT: sltu a2, a3, a4
+; CHECK-NEXT: sltu a2, a1, a3
; CHECK-NEXT: vl8r.v v8, (a0)
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a4
@@ -1120,7 +1120,7 @@ define <vscale x 128 x i1> @icmp_eq_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a3, a2, a1
-; CHECK-NEXT: sltu a4, a2, a3
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -1150,7 +1150,7 @@ define <vscale x 128 x i1> @icmp_eq_vx_swap_nxv128i8(<vscale x 128 x i8> %va, i8
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a3, a2, a1
-; CHECK-NEXT: sltu a4, a2, a3
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -2195,81 +2195,155 @@ define <vscale x 8 x i1> @icmp_sle_vi_swap_nxv8i32(<vscale x 8 x i32> %va, <vsca
}
define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: icmp_eq_vv_nxv32i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vmv1r.v v7, v0
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: srli a1, a3, 2
-; CHECK-NEXT: slli a4, a3, 3
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: add a4, a0, a4
-; CHECK-NEXT: sub a5, a2, a3
-; CHECK-NEXT: vl8re32.v v24, (a4)
-; CHECK-NEXT: sltu a4, a2, a5
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: vl8re32.v v8, (a0)
-; CHECK-NEXT: vslidedown.vx v0, v0, a1
-; CHECK-NEXT: and a4, a4, a5
-; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT: vmseq.vv v6, v16, v24, v0.t
-; CHECK-NEXT: bltu a2, a3, .LBB189_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a2, a3
-; CHECK-NEXT: .LBB189_2:
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vmseq.vv v16, v24, v8, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v16, v6, a1
-; CHECK-NEXT: vmv1r.v v0, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; RV32-LABEL: icmp_eq_vv_nxv32i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; RV32-NEXT: vmv1r.v v7, v0
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: srli a1, a3, 2
+; RV32-NEXT: slli a5, a3, 3
+; RV32-NEXT: slli a4, a3, 1
+; RV32-NEXT: add a5, a0, a5
+; RV32-NEXT: sub a6, a2, a4
+; RV32-NEXT: vl8re32.v v24, (a5)
+; RV32-NEXT: sltu a5, a4, a2
+; RV32-NEXT: addi a5, a5, -1
+; RV32-NEXT: vl8re32.v v8, (a0)
+; RV32-NEXT: vslidedown.vx v0, v0, a1
+; RV32-NEXT: and a0, a5, a6
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vmseq.vv v6, v16, v24, v0.t
+; RV32-NEXT: bltu a2, a4, .LBB189_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a2, a4
+; RV32-NEXT: .LBB189_2:
+; RV32-NEXT: vmv1r.v v0, v7
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmseq.vv v16, v24, v8, v0.t
+; RV32-NEXT: srli a3, a3, 1
+; RV32-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v16, v6, a1
+; RV32-NEXT: vmv1r.v v0, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: icmp_eq_vv_nxv32i32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; RV64-NEXT: vmv1r.v v7, v0
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: srli a1, a3, 2
+; RV64-NEXT: slli a4, a3, 3
+; RV64-NEXT: slli a3, a3, 1
+; RV64-NEXT: add a4, a0, a4
+; RV64-NEXT: sub a5, a2, a3
+; RV64-NEXT: vl8re32.v v24, (a4)
+; RV64-NEXT: sltu a4, a3, a2
+; RV64-NEXT: addi a4, a4, -1
+; RV64-NEXT: vl8re32.v v8, (a0)
+; RV64-NEXT: vslidedown.vx v0, v0, a1
+; RV64-NEXT: and a4, a4, a5
+; RV64-NEXT: vsetvli zero, a4, e32, m8, ta, ma
+; RV64-NEXT: vmseq.vv v6, v16, v24, v0.t
+; RV64-NEXT: bltu a2, a3, .LBB189_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a2, a3
+; RV64-NEXT: .LBB189_2:
+; RV64-NEXT: vmv1r.v v0, v7
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV64-NEXT: vmseq.vv v16, v24, v8, v0.t
+; RV64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v16, v6, a1
+; RV64-NEXT: vmv1r.v v0, v16
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 16
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
%v = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, metadata !"eq", <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x i1> %v
}
define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: icmp_eq_vx_nxv32i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vmv1r.v v24, v0
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: srli a2, a3, 2
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sub a4, a1, a3
-; CHECK-NEXT: sltu a5, a1, a4
-; CHECK-NEXT: addi a5, a5, -1
-; CHECK-NEXT: and a4, a5, a4
-; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t
-; CHECK-NEXT: bltu a1, a3, .LBB190_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a1, a3
-; CHECK-NEXT: .LBB190_2:
-; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v16, v25, a2
-; CHECK-NEXT: vmv1r.v v0, v16
-; CHECK-NEXT: ret
+; RV32-LABEL: icmp_eq_vx_nxv32i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT: vmv1r.v v24, v0
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: srli a2, a3, 2
+; RV32-NEXT: slli a4, a3, 1
+; RV32-NEXT: vslidedown.vx v0, v0, a2
+; RV32-NEXT: sub a5, a1, a4
+; RV32-NEXT: sltu a6, a4, a1
+; RV32-NEXT: addi a6, a6, -1
+; RV32-NEXT: and a5, a6, a5
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT: vmseq.vx v25, v16, a0, v0.t
+; RV32-NEXT: bltu a1, a4, .LBB190_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a1, a4
+; RV32-NEXT: .LBB190_2:
+; RV32-NEXT: vmv1r.v v0, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmseq.vx v16, v8, a0, v0.t
+; RV32-NEXT: srli a3, a3, 1
+; RV32-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v16, v25, a2
+; RV32-NEXT: vmv1r.v v0, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: icmp_eq_vx_nxv32i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT: vmv1r.v v24, v0
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: srli a2, a3, 2
+; RV64-NEXT: slli a3, a3, 1
+; RV64-NEXT: vslidedown.vx v0, v0, a2
+; RV64-NEXT: sub a4, a1, a3
+; RV64-NEXT: sltu a5, a3, a1
+; RV64-NEXT: addi a5, a5, -1
+; RV64-NEXT: and a4, a5, a4
+; RV64-NEXT: vsetvli zero, a4, e32, m8, ta, ma
+; RV64-NEXT: vmseq.vx v25, v16, a0, v0.t
+; RV64-NEXT: bltu a1, a3, .LBB190_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a1, a3
+; RV64-NEXT: .LBB190_2:
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT: vmseq.vx v16, v8, a0, v0.t
+; RV64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v16, v25, a2
+; RV64-NEXT: vmv1r.v v0, v16
+; RV64-NEXT: ret
%elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
%vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, metadata !"eq", <vscale x 32 x i1> %m, i32 %evl)
@@ -2277,31 +2351,58 @@ define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b,
}
define <vscale x 32 x i1> @icmp_eq_vx_swap_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: icmp_eq_vx_swap_nxv32i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vmv1r.v v24, v0
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: srli a2, a3, 2
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sub a4, a1, a3
-; CHECK-NEXT: sltu a5, a1, a4
-; CHECK-NEXT: addi a5, a5, -1
-; CHECK-NEXT: and a4, a5, a4
-; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t
-; CHECK-NEXT: bltu a1, a3, .LBB191_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a1, a3
-; CHECK-NEXT: .LBB191_2:
-; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v16, v25, a2
-; CHECK-NEXT: vmv1r.v v0, v16
-; CHECK-NEXT: ret
+; RV32-LABEL: icmp_eq_vx_swap_nxv32i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT: vmv1r.v v24, v0
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: srli a2, a3, 2
+; RV32-NEXT: slli a4, a3, 1
+; RV32-NEXT: vslidedown.vx v0, v0, a2
+; RV32-NEXT: sub a5, a1, a4
+; RV32-NEXT: sltu a6, a4, a1
+; RV32-NEXT: addi a6, a6, -1
+; RV32-NEXT: and a5, a6, a5
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT: vmseq.vx v25, v16, a0, v0.t
+; RV32-NEXT: bltu a1, a4, .LBB191_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a1, a4
+; RV32-NEXT: .LBB191_2:
+; RV32-NEXT: vmv1r.v v0, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmseq.vx v16, v8, a0, v0.t
+; RV32-NEXT: srli a3, a3, 1
+; RV32-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v16, v25, a2
+; RV32-NEXT: vmv1r.v v0, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: icmp_eq_vx_swap_nxv32i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT: vmv1r.v v24, v0
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: srli a2, a3, 2
+; RV64-NEXT: slli a3, a3, 1
+; RV64-NEXT: vslidedown.vx v0, v0, a2
+; RV64-NEXT: sub a4, a1, a3
+; RV64-NEXT: sltu a5, a3, a1
+; RV64-NEXT: addi a5, a5, -1
+; RV64-NEXT: and a4, a5, a4
+; RV64-NEXT: vsetvli zero, a4, e32, m8, ta, ma
+; RV64-NEXT: vmseq.vx v25, v16, a0, v0.t
+; RV64-NEXT: bltu a1, a3, .LBB191_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a1, a3
+; RV64-NEXT: .LBB191_2:
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT: vmseq.vx v16, v8, a0, v0.t
+; RV64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v16, v25, a2
+; RV64-NEXT: vmv1r.v v0, v16
+; RV64-NEXT: ret
%elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
%vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> %vb, <vscale x 32 x i32> %va, metadata !"eq", <vscale x 32 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
index 6381887a1a2f9..3d34a619ce8bf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
@@ -595,7 +595,7 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
; CHECK-RV32-NEXT: vmv1r.v v9, v0
; CHECK-RV32-NEXT: csrr a4, vlenb
; CHECK-RV32-NEXT: sub a2, a3, a4
-; CHECK-RV32-NEXT: sltu a5, a3, a2
+; CHECK-RV32-NEXT: sltu a5, a4, a3
; CHECK-RV32-NEXT: addi a5, a5, -1
; CHECK-RV32-NEXT: and a2, a5, a2
; CHECK-RV32-NEXT: bltu a3, a4, .LBB55_2
@@ -621,7 +621,7 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
; CHECK-RV64-NEXT: vmv1r.v v9, v0
; CHECK-RV64-NEXT: csrr a4, vlenb
; CHECK-RV64-NEXT: sub a3, a2, a4
-; CHECK-RV64-NEXT: sltu a5, a2, a3
+; CHECK-RV64-NEXT: sltu a5, a4, a2
; CHECK-RV64-NEXT: addi a5, a5, -1
; CHECK-RV64-NEXT: and a3, a5, a3
; CHECK-RV64-NEXT: bltu a2, a4, .LBB55_2
@@ -647,19 +647,19 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
define <vscale x 16 x double> @strided_load_nxv16f64_allones_mask(ptr %ptr, i64 %stride, i32 zeroext %evl) {
; CHECK-RV32-LABEL: strided_load_nxv16f64_allones_mask:
; CHECK-RV32: # %bb.0:
-; CHECK-RV32-NEXT: csrr a4, vlenb
-; CHECK-RV32-NEXT: sub a2, a3, a4
-; CHECK-RV32-NEXT: sltu a5, a3, a2
+; CHECK-RV32-NEXT: csrr a2, vlenb
+; CHECK-RV32-NEXT: sub a4, a3, a2
+; CHECK-RV32-NEXT: sltu a5, a2, a3
; CHECK-RV32-NEXT: addi a5, a5, -1
-; CHECK-RV32-NEXT: and a2, a5, a2
-; CHECK-RV32-NEXT: bltu a3, a4, .LBB56_2
+; CHECK-RV32-NEXT: and a4, a5, a4
+; CHECK-RV32-NEXT: bltu a3, a2, .LBB56_2
; CHECK-RV32-NEXT: # %bb.1:
-; CHECK-RV32-NEXT: mv a3, a4
+; CHECK-RV32-NEXT: mv a3, a2
; CHECK-RV32-NEXT: .LBB56_2:
-; CHECK-RV32-NEXT: mul a4, a3, a1
-; CHECK-RV32-NEXT: add a4, a0, a4
-; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-RV32-NEXT: vlse64.v v16, (a4), a1
+; CHECK-RV32-NEXT: mul a2, a3, a1
+; CHECK-RV32-NEXT: add a2, a0, a2
+; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m8, ta, ma
+; CHECK-RV32-NEXT: vlse64.v v16, (a2), a1
; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1
; CHECK-RV32-NEXT: ret
@@ -668,7 +668,7 @@ define <vscale x 16 x double> @strided_load_nxv16f64_allones_mask(ptr %ptr, i64
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: csrr a4, vlenb
; CHECK-RV64-NEXT: sub a3, a2, a4
-; CHECK-RV64-NEXT: sltu a5, a2, a3
+; CHECK-RV64-NEXT: sltu a5, a4, a2
; CHECK-RV64-NEXT: addi a5, a5, -1
; CHECK-RV64-NEXT: and a3, a5, a3
; CHECK-RV64-NEXT: bltu a2, a4, .LBB56_2
@@ -703,7 +703,7 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
; CHECK-RV32-NEXT: mv a6, a7
; CHECK-RV32-NEXT: .LBB57_2:
; CHECK-RV32-NEXT: sub a5, a6, a2
-; CHECK-RV32-NEXT: sltu t0, a6, a5
+; CHECK-RV32-NEXT: sltu t0, a2, a6
; CHECK-RV32-NEXT: addi t0, t0, -1
; CHECK-RV32-NEXT: and t0, t0, a5
; CHECK-RV32-NEXT: mv a5, a6
@@ -713,15 +713,15 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
; CHECK-RV32-NEXT: .LBB57_4:
; CHECK-RV32-NEXT: mul t1, a5, a1
; CHECK-RV32-NEXT: srli t2, a2, 3
-; CHECK-RV32-NEXT: sub a7, a3, a7
; CHECK-RV32-NEXT: vsetvli t3, zero, e8, mf4, ta, ma
; CHECK-RV32-NEXT: vslidedown.vx v0, v8, t2
+; CHECK-RV32-NEXT: sub t2, a3, a7
; CHECK-RV32-NEXT: add t1, a0, t1
; CHECK-RV32-NEXT: vsetvli zero, t0, e64, m8, ta, ma
; CHECK-RV32-NEXT: vlse64.v v16, (t1), a1, v0.t
-; CHECK-RV32-NEXT: sltu a3, a3, a7
+; CHECK-RV32-NEXT: sltu a3, a7, a3
; CHECK-RV32-NEXT: addi a3, a3, -1
-; CHECK-RV32-NEXT: and a3, a3, a7
+; CHECK-RV32-NEXT: and a3, a3, t2
; CHECK-RV32-NEXT: bltu a3, a2, .LBB57_6
; CHECK-RV32-NEXT: # %bb.5:
; CHECK-RV32-NEXT: mv a3, a2
@@ -751,7 +751,7 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
; CHECK-RV64-NEXT: mv a6, a7
; CHECK-RV64-NEXT: .LBB57_2:
; CHECK-RV64-NEXT: sub a5, a6, a4
-; CHECK-RV64-NEXT: sltu t0, a6, a5
+; CHECK-RV64-NEXT: sltu t0, a4, a6
; CHECK-RV64-NEXT: addi t0, t0, -1
; CHECK-RV64-NEXT: and t0, t0, a5
; CHECK-RV64-NEXT: mv a5, a6
@@ -761,15 +761,15 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
; CHECK-RV64-NEXT: .LBB57_4:
; CHECK-RV64-NEXT: mul t1, a5, a1
; CHECK-RV64-NEXT: srli t2, a4, 3
-; CHECK-RV64-NEXT: sub a7, a2, a7
; CHECK-RV64-NEXT: vsetvli t3, zero, e8, mf4, ta, ma
; CHECK-RV64-NEXT: vslidedown.vx v0, v8, t2
+; CHECK-RV64-NEXT: sub t2, a2, a7
; CHECK-RV64-NEXT: add t1, a0, t1
; CHECK-RV64-NEXT: vsetvli zero, t0, e64, m8, ta, ma
; CHECK-RV64-NEXT: vlse64.v v16, (t1), a1, v0.t
-; CHECK-RV64-NEXT: sltu a2, a2, a7
+; CHECK-RV64-NEXT: sltu a2, a7, a2
; CHECK-RV64-NEXT: addi a2, a2, -1
-; CHECK-RV64-NEXT: and a2, a2, a7
+; CHECK-RV64-NEXT: and a2, a2, t2
; CHECK-RV64-NEXT: bltu a2, a4, .LBB57_6
; CHECK-RV64-NEXT: # %bb.5:
; CHECK-RV64-NEXT: mv a2, a4
@@ -861,10 +861,10 @@ define <vscale x 16 x i64> @zero_strided_vadd_nxv16i64(<vscale x 16 x i64> %v, p
; CHECK-RV32: # %bb.0:
; CHECK-RV32-NEXT: csrr a1, vlenb
; CHECK-RV32-NEXT: srli a2, a1, 3
-; CHECK-RV32-NEXT: sub a1, a2, a1
-; CHECK-RV32-NEXT: sltu a3, a2, a1
-; CHECK-RV32-NEXT: addi a3, a3, -1
-; CHECK-RV32-NEXT: and a1, a3, a1
+; CHECK-RV32-NEXT: sub a3, a2, a1
+; CHECK-RV32-NEXT: sltu a1, a1, a2
+; CHECK-RV32-NEXT: addi a1, a1, -1
+; CHECK-RV32-NEXT: and a1, a1, a3
; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-RV32-NEXT: vlse64.v v24, (a0), zero
; CHECK-RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
index 2ec89888af077..12ff5e98c00e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
@@ -492,12 +492,12 @@ define void @strided_store_nxv16f64(<vscale x 16 x double> %v, ptr %ptr, i32 sig
; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: sub a5, a2, a3
+; CHECK-NEXT: sltu a2, a3, a2
; CHECK-NEXT: mul a4, a4, a1
; CHECK-NEXT: srli a3, a3, 3
-; CHECK-NEXT: sltu a2, a2, a5
+; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: vsetvli a6, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v0, a3
-; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a5
; CHECK-NEXT: add a0, a0, a4
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -508,25 +508,45 @@ define void @strided_store_nxv16f64(<vscale x 16 x double> %v, ptr %ptr, i32 sig
}
define void @strided_store_nxv16f64_allones_mask(<vscale x 16 x double> %v, ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
-; CHECK-LABEL: strided_store_nxv16f64_allones_mask:
-; CHECK: # %bb.0:
-; CHECK-NEXT: csrr a4, vlenb
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: bltu a2, a4, .LBB47_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a3, a4
-; CHECK-NEXT: .LBB47_2:
-; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT: vsse64.v v8, (a0), a1
-; CHECK-NEXT: sub a4, a2, a4
-; CHECK-NEXT: mul a3, a3, a1
-; CHECK-NEXT: sltu a2, a2, a4
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a4
-; CHECK-NEXT: add a0, a0, a3
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vsse64.v v16, (a0), a1
-; CHECK-NEXT: ret
+; CHECK-RV32-LABEL: strided_store_nxv16f64_allones_mask:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: csrr a3, vlenb
+; CHECK-RV32-NEXT: mv a4, a2
+; CHECK-RV32-NEXT: bltu a2, a3, .LBB47_2
+; CHECK-RV32-NEXT: # %bb.1:
+; CHECK-RV32-NEXT: mv a4, a3
+; CHECK-RV32-NEXT: .LBB47_2:
+; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m8, ta, ma
+; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1
+; CHECK-RV32-NEXT: sub a5, a2, a3
+; CHECK-RV32-NEXT: sltu a2, a3, a2
+; CHECK-RV32-NEXT: mul a3, a4, a1
+; CHECK-RV32-NEXT: addi a2, a2, -1
+; CHECK-RV32-NEXT: and a2, a2, a5
+; CHECK-RV32-NEXT: add a0, a0, a3
+; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV32-NEXT: vsse64.v v16, (a0), a1
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: strided_store_nxv16f64_allones_mask:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: csrr a4, vlenb
+; CHECK-RV64-NEXT: mv a3, a2
+; CHECK-RV64-NEXT: bltu a2, a4, .LBB47_2
+; CHECK-RV64-NEXT: # %bb.1:
+; CHECK-RV64-NEXT: mv a3, a4
+; CHECK-RV64-NEXT: .LBB47_2:
+; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-RV64-NEXT: vsse64.v v8, (a0), a1
+; CHECK-RV64-NEXT: sub a5, a2, a4
+; CHECK-RV64-NEXT: sltu a2, a4, a2
+; CHECK-RV64-NEXT: mul a3, a3, a1
+; CHECK-RV64-NEXT: addi a2, a2, -1
+; CHECK-RV64-NEXT: and a2, a2, a5
+; CHECK-RV64-NEXT: add a0, a0, a3
+; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV64-NEXT: vsse64.v v16, (a0), a1
+; CHECK-RV64-NEXT: ret
call void @llvm.experimental.vp.strided.store.nxv16f64.p0.i32(<vscale x 16 x double> %v, ptr %ptr, i32 %stride, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret void
}
@@ -554,19 +574,19 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
; CHECK-NEXT: vsetvli zero, a7, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v8, (a1), a2, v0.t
; CHECK-NEXT: sub a0, a5, a4
-; CHECK-NEXT: mul a7, a7, a2
-; CHECK-NEXT: srli t0, a4, 3
-; CHECK-NEXT: sub a6, a3, a6
+; CHECK-NEXT: sub t0, a3, a6
+; CHECK-NEXT: sltu a3, a6, a3
+; CHECK-NEXT: srli a6, a4, 3
; CHECK-NEXT: vsetvli t1, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v7, t0
-; CHECK-NEXT: sltu t0, a5, a0
+; CHECK-NEXT: vslidedown.vx v0, v7, a6
+; CHECK-NEXT: sltu a6, a4, a5
+; CHECK-NEXT: mul a7, a7, a2
+; CHECK-NEXT: addi a6, a6, -1
; CHECK-NEXT: add a7, a1, a7
-; CHECK-NEXT: sltu a3, a3, a6
-; CHECK-NEXT: addi t0, t0, -1
; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and t0, t0, a0
-; CHECK-NEXT: and a0, a3, a6
-; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma
+; CHECK-NEXT: and a6, a6, a0
+; CHECK-NEXT: and a0, a3, t0
+; CHECK-NEXT: vsetvli zero, a6, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v16, (a7), a2, v0.t
; CHECK-NEXT: bltu a0, a4, .LBB48_6
; CHECK-NEXT: # %bb.5:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
index c64b755051898..6378135654ed1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
@@ -551,7 +551,7 @@ define <vscale x 128 x i8> @vadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: sub a2, a1, a0
-; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: sltu a3, a0, a1
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
@@ -574,7 +574,7 @@ define <vscale x 128 x i8> @vadd_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va, i
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
@@ -1306,7 +1306,7 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -1329,7 +1329,7 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va, i
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -1354,11 +1354,11 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, <v
; RV32-NEXT: srli a1, a0, 2
; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; RV32-NEXT: vadd.vi v8, v8, -1, v0.t
-; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT: slli a2, a0, 1
+; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a1
-; RV32-NEXT: slli a1, a0, 1
-; RV32-NEXT: sub a1, a0, a1
-; RV32-NEXT: sltu a0, a0, a1
+; RV32-NEXT: sub a1, a0, a2
+; RV32-NEXT: sltu a0, a2, a0
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a1
; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -1374,7 +1374,7 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, <v
; RV64-NEXT: slli a1, a0, 1
; RV64-NEXT: vslidedown.vx v0, v0, a2
; RV64-NEXT: sub a2, a0, a1
-; RV64-NEXT: sltu a3, a0, a2
+; RV64-NEXT: sltu a3, a1, a0
; RV64-NEXT: addi a3, a3, -1
; RV64-NEXT: and a2, a3, a2
; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
index e0fcd4009ad2e..7d97f353a22b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
@@ -847,7 +847,7 @@ define <vscale x 16 x double> @vfabs_vv_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a3, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -869,7 +869,7 @@ define <vscale x 16 x double> @vfabs_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index d03b068e11ea8..42b1da9d97f2c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -928,13 +928,13 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; ZVFH-NEXT: slli a1, a2, 1
; ZVFH-NEXT: srli a2, a2, 2
; ZVFH-NEXT: sub a3, a0, a1
+; ZVFH-NEXT: sltu a4, a1, a0
; ZVFH-NEXT: vslidedown.vx v0, v0, a2
-; ZVFH-NEXT: sltu a2, a0, a3
-; ZVFH-NEXT: addi a2, a2, -1
-; ZVFH-NEXT: and a2, a2, a3
-; ZVFH-NEXT: addi a3, sp, 16
-; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT: addi a4, a4, -1
+; ZVFH-NEXT: and a3, a4, a3
+; ZVFH-NEXT: addi a2, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -977,13 +977,13 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1026,13 +1026,13 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: sltu a4, a1, a0
; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: addi a3, sp, 16
-; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: addi a2, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16alt, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1079,14 +1079,14 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; ZVFH-NEXT: slli a1, a2, 1
; ZVFH-NEXT: srli a2, a2, 2
; ZVFH-NEXT: sub a3, a0, a1
-; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFH-NEXT: sltu a4, a1, a0
+; ZVFH-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFH-NEXT: vslidedown.vx v0, v24, a2
-; ZVFH-NEXT: sltu a2, a0, a3
-; ZVFH-NEXT: addi a2, a2, -1
-; ZVFH-NEXT: and a2, a2, a3
-; ZVFH-NEXT: addi a3, sp, 16
-; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT: addi a4, a4, -1
+; ZVFH-NEXT: and a3, a4, a3
+; ZVFH-NEXT: addi a2, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1128,14 +1128,14 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1177,14 +1177,14 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
-; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: sltu a4, a1, a0
+; ZVFBFA-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: addi a3, sp, 16
-; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: addi a2, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16alt, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1216,130 +1216,6 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
}
define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfadd_vf_nxv32bf16:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: addi sp, sp, -16
-; ZVFH-NEXT: .cfi_def_cfa_offset 16
-; ZVFH-NEXT: csrr a1, vlenb
-; ZVFH-NEXT: slli a1, a1, 4
-; ZVFH-NEXT: sub sp, sp, a1
-; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFH-NEXT: vmv1r.v v7, v0
-; ZVFH-NEXT: fmv.x.h a1, fa0
-; ZVFH-NEXT: csrr a2, vlenb
-; ZVFH-NEXT: vmv.v.x v24, a1
-; ZVFH-NEXT: slli a1, a2, 1
-; ZVFH-NEXT: srli a2, a2, 2
-; ZVFH-NEXT: sub a3, a0, a1
-; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFH-NEXT: vslidedown.vx v0, v0, a2
-; ZVFH-NEXT: sltu a2, a0, a3
-; ZVFH-NEXT: addi a2, a2, -1
-; ZVFH-NEXT: and a2, a2, a3
-; ZVFH-NEXT: csrr a3, vlenb
-; ZVFH-NEXT: slli a3, a3, 3
-; ZVFH-NEXT: add a3, sp, a3
-; ZVFH-NEXT: addi a3, a3, 16
-; ZVFH-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vfadd.vv v16, v24, v16, v0.t
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFH-NEXT: bltu a0, a1, .LBB24_2
-; ZVFH-NEXT: # %bb.1:
-; ZVFH-NEXT: mv a0, a1
-; ZVFH-NEXT: .LBB24_2:
-; ZVFH-NEXT: vmv1r.v v0, v7
-; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add sp, sp, a0
-; ZVFH-NEXT: .cfi_def_cfa sp, 16
-; ZVFH-NEXT: addi sp, sp, 16
-; ZVFH-NEXT: .cfi_def_cfa_offset 0
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v7, v0
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB24_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB24_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
-;
; ZVFBFA-LABEL: vfadd_vf_nxv32bf16:
; ZVFBFA: # %bb.0:
; ZVFBFA-NEXT: addi sp, sp, -16
@@ -1355,14 +1231,14 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
-; ZVFBFA-NEXT: vsetvli a4, zero, e8alt, mf2, ta, ma
+; ZVFBFA-NEXT: sltu a4, a1, a0
+; ZVFBFA-NEXT: vsetvli a5, zero, e8alt, mf2, ta, ma
; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: addi a3, sp, 16
-; ZVFBFA-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: addi a2, sp, 16
+; ZVFBFA-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16alt, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v28, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1397,108 +1273,6 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
}
define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; ZVFH-LABEL: vfadd_vf_nxv32bf16_unmasked:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: addi sp, sp, -16
-; ZVFH-NEXT: .cfi_def_cfa_offset 16
-; ZVFH-NEXT: csrr a1, vlenb
-; ZVFH-NEXT: slli a1, a1, 3
-; ZVFH-NEXT: sub sp, sp, a1
-; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFH-NEXT: fmv.x.h a1, fa0
-; ZVFH-NEXT: csrr a2, vlenb
-; ZVFH-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFH-NEXT: vmset.m v24
-; ZVFH-NEXT: vmv.v.x v16, a1
-; ZVFH-NEXT: slli a1, a2, 1
-; ZVFH-NEXT: srli a2, a2, 2
-; ZVFH-NEXT: sub a3, a0, a1
-; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFH-NEXT: vslidedown.vx v0, v24, a2
-; ZVFH-NEXT: sltu a2, a0, a3
-; ZVFH-NEXT: addi a2, a2, -1
-; ZVFH-NEXT: and a2, a2, a3
-; ZVFH-NEXT: addi a3, sp, 16
-; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFH-NEXT: bltu a0, a1, .LBB25_2
-; ZVFH-NEXT: # %bb.1:
-; ZVFH-NEXT: mv a0, a1
-; ZVFH-NEXT: .LBB25_2:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v0
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vfadd.vv v16, v16, v24
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add sp, sp, a0
-; ZVFH-NEXT: .cfi_def_cfa sp, 16
-; ZVFH-NEXT: addi sp, sp, 16
-; ZVFH-NEXT: .cfi_def_cfa_offset 0
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: vmv.v.x v16, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB25_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB25_2:
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
-;
; ZVFBFA-LABEL: vfadd_vf_nxv32bf16_unmasked:
; ZVFBFA: # %bb.0:
; ZVFBFA-NEXT: addi sp, sp, -16
@@ -1514,14 +1288,14 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
-; ZVFBFA-NEXT: vsetvli a4, zero, e8alt, mf2, ta, ma
+; ZVFBFA-NEXT: sltu a4, a1, a0
+; ZVFBFA-NEXT: vsetvli a5, zero, e8alt, mf2, ta, ma
; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: addi a3, sp, 16
-; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: addi a2, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16alt, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -2351,13 +2125,13 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -2400,13 +2174,13 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: sltu a4, a1, a0
; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: addi a3, sp, 16
-; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: addi a2, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -2459,14 +2233,14 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -2508,14 +2282,14 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
-; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: sltu a4, a1, a0
+; ZVFBFA-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: addi a3, sp, 16
-; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: addi a2, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -2553,68 +2327,6 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFH-NEXT: vfadd.vf v8, v8, fa0, v0.t
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v7, v0
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB50_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB50_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
-;
; ZVFBFA-LABEL: vfadd_vf_nxv32f16:
; ZVFBFA: # %bb.0:
; ZVFBFA-NEXT: addi sp, sp, -16
@@ -2631,17 +2343,17 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
-; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: sltu a4, a1, a0
+; ZVFBFA-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: csrr a3, vlenb
-; ZVFBFA-NEXT: slli a3, a3, 3
-; ZVFBFA-NEXT: add a3, sp, a3
-; ZVFBFA-NEXT: addi a3, a3, 16
-; ZVFBFA-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: slli a2, a2, 3
+; ZVFBFA-NEXT: add a2, sp, a2
+; ZVFBFA-NEXT: addi a2, a2, 16
+; ZVFBFA-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v28, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -2689,57 +2401,6 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFH-NEXT: vfadd.vf v8, v8, fa0
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32f16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: vmv.v.x v16, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB51_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB51_2:
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
-;
; ZVFBFA-LABEL: vfadd_vf_nxv32f16_unmasked:
; ZVFBFA: # %bb.0:
; ZVFBFA-NEXT: addi sp, sp, -16
@@ -2756,14 +2417,14 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
-; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: sltu a4, a1, a0
+; ZVFBFA-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: addi a3, sp, 16
-; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: addi a2, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
index e9d7137919ac9..5f8603067d82a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
@@ -367,13 +367,13 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -420,14 +420,14 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -459,67 +459,6 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
}
define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfdiv_vf_nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v7, v0
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vmv.v.x v24, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB22_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB22_2:
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -527,56 +466,6 @@ define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
}
define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfdiv_vf_nxv32bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmset.m v24
-; CHECK-NEXT: vmv.v.x v16, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB23_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB23_2:
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v16, v16, v24
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -1064,13 +953,13 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1123,14 +1012,14 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1167,68 +1056,6 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfdiv.vf v8, v8, fa0, v0.t
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v7, v0
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB46_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB46_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fdiv.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -1241,57 +1068,6 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfdiv.vf v8, v8, fa0
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: vmv.v.x v16, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB47_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB47_2:
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fdiv.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
index c25a0d47c5c53..03cbe8c5d555c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
@@ -600,16 +600,16 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
; CHECK-NEXT: slli a0, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a1, a0
+; CHECK-NEXT: sltu a4, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a1, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -716,17 +716,17 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat>
; CHECK-NEXT: slli a0, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a1, a0
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a0, a1
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a1, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -815,7 +815,7 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfl
; CHECK-NEXT: srli a3, a3, 2
; CHECK-NEXT: sub a4, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a3
-; CHECK-NEXT: sltu a3, a0, a4
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a4
; CHECK-NEXT: csrr a4, vlenb
@@ -912,124 +912,6 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfl
}
define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_commute(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfma_vf_nxv32bf16_commute:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v3, v0
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a1, a1, a2
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vmv.v.x v8, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 4
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28, v0.t
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v4, v8, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB33_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB33_2:
-; CHECK-NEXT: vmv1r.v v0, v3
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t
-; CHECK-NEXT: vmv4r.v v12, v4
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> %m, i32 %evl)
@@ -1058,7 +940,7 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat>
; CHECK-NEXT: sub a4, a0, a1
; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v8, a3
-; CHECK-NEXT: sltu a3, a0, a4
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a4
; CHECK-NEXT: csrr a4, vlenb
@@ -1161,107 +1043,6 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat>
}
define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked_commute(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x bfloat> %vc, i32 zeroext %evl) {
-; CHECK-LABEL: vfma_vf_nxv32bf16_unmasked_commute:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a1, a1, a2
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmset.m v8
-; CHECK-NEXT: vmv.v.x v24, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v8, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 4
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20, v0.t
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28, v0.t
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v20, v8, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB35_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB35_2:
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v0, v24, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v16, v0
-; CHECK-NEXT: vmv8r.v v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -2049,16 +1830,16 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a0, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a1, a0
+; ZVFHMIN-NEXT: sltu a4, a0, a1
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a1, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -2172,17 +1953,17 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: slli a0, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a1, a0
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a0, a1
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a1, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -2277,7 +2058,7 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -2378,153 +2159,34 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_commute(<vscale x 32 x half> %va,
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfmadd.vf v8, fa0, v16, v0.t
+; ZVFH-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+ %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfma_vf_nxv32f16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vfmadd.vf v8, fa0, v16
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfma_vf_nxv32f16_commute:
+; ZVFHMIN-LABEL: vfma_vf_nxv32f16_unmasked:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 3
; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: slli a1, a1, 2
; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v8, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 4
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB69_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB69_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT: vmv4r.v v12, v4
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
- %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfma_vf_nxv32f16_unmasked:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT: vfmadd.vf v8, fa0, v16
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfma_vf_nxv32f16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 2
-; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmv8r.v v24, v8
-; ZVFHMIN-NEXT: fmv.x.h a2, fa0
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT: vmv8r.v v24, v8
+; ZVFHMIN-NEXT: fmv.x.h a2, fa0
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: vmset.m v8
; ZVFHMIN-NEXT: slli a1, a3, 1
@@ -2532,7 +2194,7 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -2640,108 +2302,6 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked_commute(<vscale x 32 x ha
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfmadd.vf v8, fa0, v16
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfma_vf_nxv32f16_unmasked_commute:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmset.m v8
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 4
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB71_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB71_2:
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0
-; ZVFHMIN-NEXT: vmv8r.v v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -3428,14 +2988,14 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
; CHECK-NEXT: slli a5, a1, 3
; CHECK-NEXT: sub a6, a4, a1
; CHECK-NEXT: add a7, a2, a5
-; CHECK-NEXT: add a5, a0, a5
; CHECK-NEXT: vl8re64.v v8, (a7)
; CHECK-NEXT: csrr a7, vlenb
; CHECK-NEXT: slli a7, a7, 3
; CHECK-NEXT: add a7, sp, a7
; CHECK-NEXT: addi a7, a7, 16
; CHECK-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: sltu a7, a4, a6
+; CHECK-NEXT: sltu a7, a1, a4
+; CHECK-NEXT: add a5, a0, a5
; CHECK-NEXT: addi a7, a7, -1
; CHECK-NEXT: vl8re64.v v8, (a5)
; CHECK-NEXT: csrr a5, vlenb
@@ -3563,7 +3123,7 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: sub a5, a4, a1
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vl8re64.v v24, (a3)
-; CHECK-NEXT: sltu a3, a4, a5
+; CHECK-NEXT: sltu a3, a1, a4
; CHECK-NEXT: vl8re64.v v8, (a2)
; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
@@ -7976,35 +7536,36 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vmv8r.v v16, v8
-; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT: vmv8r.v v24, v8
+; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
; ZVFHMIN-NEXT: lui a2, 8
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a0, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: sub a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t
+; ZVFHMIN-NEXT: sltu a3, a0, a1
; ZVFHMIN-NEXT: addi a3, a3, -1
+; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vxor.vx v16, v8, a2, v0.t
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: vmv1r.v v0, v6
+; ZVFHMIN-NEXT: vmv4r.v v8, v16
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: mv a3, a2
@@ -8013,37 +7574,37 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
; ZVFHMIN-NEXT: bltu a1, a0, .LBB280_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a0
; ZVFHMIN-NEXT: .LBB280_2:
; ZVFHMIN-NEXT: vmv1r.v v0, v3
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: addi a0, sp, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
@@ -8052,17 +7613,17 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
; ZVFHMIN-NEXT: vmv.v.v v16, v8
; ZVFHMIN-NEXT: vmv4r.v v12, v4
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
@@ -8114,10 +7675,10 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: sub a4, a1, a0
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
+; ZVFHMIN-NEXT: sltu a3, a0, a1
+; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: vmv4r.v v8, v16
; ZVFHMIN-NEXT: addi a2, sp, 16
@@ -8229,7 +7790,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, half %
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -8338,128 +7899,6 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_commute(<vscale x 32 x half> %va
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfmsub.vf v8, fa0, v16, v0.t
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfmsub_vf_nxv32f16_commute:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v8, a1
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT: slli a1, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB283_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB283_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT: vmv4r.v v12, v4
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
@@ -8498,7 +7937,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -8606,9 +8045,75 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfmsub.vf v8, fa0, v16
+; ZVFH-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+ %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+ %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v16, v8, v24, v0.t
+; ZVFH-NEXT: vmv.v.v v8, v16
+; ZVFH-NEXT: ret
+ %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+ %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16_commuted:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v8, v16, v24, v0.t
+; ZVFH-NEXT: ret
+ %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+ %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT: ret
+ %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT: ret
+ %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfmsub_vf_nxv32f16_unmasked_commute:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
@@ -8616,30 +8121,31 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: slli a1, a1, 5
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv1r.v v3, v0
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: lui a2, 8
+; ZVFHMIN-NEXT: csrr a3, vlenb
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: mv a4, a1
; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a2
+; ZVFHMIN-NEXT: add a1, a1, a4
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v8, a1
+; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t
+; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a3, a3, 3
; ZVFHMIN-NEXT: add a3, sp, a3
@@ -8654,7 +8160,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: mv a3, a2
@@ -8662,27 +8168,39 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
+; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB285_2
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB290_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB285_2:
+; ZVFHMIN-NEXT: .LBB290_2:
+; ZVFHMIN-NEXT: vmv1r.v v0, v3
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 3
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: mv a1, a0
@@ -8690,1046 +8208,21 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0
-; ZVFHMIN-NEXT: vmv8r.v v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
- %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v16, v8, v24, v0.t
-; ZVFH-NEXT: vmv.v.v v8, v16
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t
-; ZVFHMIN-NEXT: slli a0, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: csrr a5, vlenb
-; ZVFHMIN-NEXT: slli a5, a5, 4
-; ZVFHMIN-NEXT: add a5, sp, a5
-; ZVFHMIN-NEXT: addi a5, a5, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v24, v24, a2, v0.t
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB286_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB286_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT: vmv4r.v v12, v4
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
- %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16_commuted:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v8, v16, v24, v0.t
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_commuted:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT: slli a0, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB287_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB287_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v16, v8
-; ZVFHMIN-NEXT: vmv4r.v v12, v4
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
- %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT: slli a0, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB288_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB288_2:
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT: slli a0, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB289_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB289_2:
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a4, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a4
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT: slli a1, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB290_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB290_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v16, v8
-; ZVFHMIN-NEXT: vmv4r.v v12, v4
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
- %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
- %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_commute:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_commute:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a4, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a4
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT: slli a1, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB291_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB291_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v16, v8
-; ZVFHMIN-NEXT: vmv4r.v v12, v4
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
- %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
- %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 2
-; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; ZVFHMIN-NEXT: fmv.x.h a2, fa0
-; ZVFHMIN-NEXT: lui a1, 8
-; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v7
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v24, v8, a1
-; ZVFHMIN-NEXT: vxor.vx v8, v16, a1
-; ZVFHMIN-NEXT: slli a1, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a0, a1
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: slli a4, a4, 3
-; ZVFHMIN-NEXT: mv a5, a4
-; ZVFHMIN-NEXT: slli a4, a4, 1
-; ZVFHMIN-NEXT: add a4, a4, a5
-; ZVFHMIN-NEXT: add a4, sp, a4
-; ZVFHMIN-NEXT: addi a4, a4, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: slli a4, a4, 4
-; ZVFHMIN-NEXT: add a4, sp, a4
-; ZVFHMIN-NEXT: addi a4, a4, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: slli a4, a4, 5
-; ZVFHMIN-NEXT: add a4, sp, a4
-; ZVFHMIN-NEXT: addi a4, a4, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: slli a4, a4, 3
-; ZVFHMIN-NEXT: add a4, sp, a4
-; ZVFHMIN-NEXT: addi a4, a4, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v8, a2
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB292_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB292_2:
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v0, v16
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 2
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
- %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: fmv.x.h a2, fa0
-; ZVFHMIN-NEXT: lui a1, 8
-; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a1
-; ZVFHMIN-NEXT: slli a1, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a0, a1
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: slli a4, a4, 4
-; ZVFHMIN-NEXT: add a4, sp, a4
-; ZVFHMIN-NEXT: addi a4, a4, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: slli a4, a4, 3
-; ZVFHMIN-NEXT: add a4, sp, a4
-; ZVFHMIN-NEXT: addi a4, a4, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: slli a4, a4, 3
-; ZVFHMIN-NEXT: mv a5, a4
-; ZVFHMIN-NEXT: slli a4, a4, 1
-; ZVFHMIN-NEXT: add a4, a4, a5
-; ZVFHMIN-NEXT: add a4, sp, a4
-; ZVFHMIN-NEXT: addi a4, a4, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v8, a2
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB293_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB293_2:
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT: vmv.v.v v16, v8
+; ZVFHMIN-NEXT: vmv4r.v v12, v4
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add sp, sp, a0
@@ -9739,20 +8232,20 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
+ %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x half> %v
}
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_commute:
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_commute:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
@@ -9760,30 +8253,31 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: slli a1, a1, 5
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv1r.v v3, v0
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: lui a2, 8
+; ZVFHMIN-NEXT: csrr a3, vlenb
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: mv a4, a1
; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a2
+; ZVFHMIN-NEXT: add a1, a1, a4
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
+; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t
+; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a3, a3, 3
; ZVFHMIN-NEXT: add a3, sp, a3
@@ -9814,10 +8308,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB294_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB291_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB294_2:
+; ZVFHMIN-NEXT: .LBB291_2:
; ZVFHMIN-NEXT: vmv1r.v v0, v3
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 3
@@ -9870,249 +8364,134 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
- %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: slli a1, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB295_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB295_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v16, v8
-; ZVFHMIN-NEXT: vmv4r.v v12, v4
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
- %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
%negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x half> %v
}
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked:
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 3
; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: slli a1, a1, 2
; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT: fmv.x.h a2, fa0
+; ZVFHMIN-NEXT: lui a1, 8
; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma
; ZVFHMIN-NEXT: vmset.m v7
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT: vxor.vx v24, v8, a1
+; ZVFHMIN-NEXT: vxor.vx v8, v16, a1
; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sub a4, a0, a1
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 4
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: and a3, a3, a4
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: slli a4, a4, 4
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: slli a4, a4, 5
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v8, a2
; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: slli a2, a2, 4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB296_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB292_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB296_2:
+; ZVFHMIN-NEXT: .LBB292_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24
+; ZVFHMIN-NEXT: addi a0, sp, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v16
+; ZVFHMIN-NEXT: vfmadd.vv v24, v0, v16
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
; ZVFHMIN-NEXT: addi sp, sp, 16
@@ -10120,20 +8499,20 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
%negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
ret <vscale x 32 x half> %v
}
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
@@ -10141,80 +8520,74 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: slli a1, a1, 5
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
+; ZVFHMIN-NEXT: fmv.x.h a2, fa0
+; ZVFHMIN-NEXT: lui a1, 8
; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v7
+; ZVFHMIN-NEXT: vmset.m v24
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT: vxor.vx v8, v8, a1
+; ZVFHMIN-NEXT: vxor.vx v16, v16, a1
; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
+; ZVFHMIN-NEXT: sub a4, a0, a1
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 4
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: and a3, a3, a4
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: slli a4, a4, 4
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v8, a2
; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB297_2
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB293_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB297_2:
+; ZVFHMIN-NEXT: .LBB293_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: mv a1, a0
@@ -10222,15 +8595,20 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8
+; ZVFHMIN-NEXT: vfmadd.vv v16, v0, v24
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0
-; ZVFHMIN-NEXT: vmv8r.v v8, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add sp, sp, a0
@@ -10240,79 +8618,66 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
%negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
ret <vscale x 32 x half> %v
}
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v16, v8, v24, v0.t
-; ZVFH-NEXT: vmv.v.v v8, v16
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 5
+; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
; ZVFHMIN-NEXT: lui a2, 8
; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t
-; ZVFHMIN-NEXT: slli a0, a3, 1
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
+; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
+; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t
+; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
+; ZVFHMIN-NEXT: sub a2, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: csrr a5, vlenb
-; ZVFHMIN-NEXT: slli a5, a5, 4
-; ZVFHMIN-NEXT: add a5, sp, a5
-; ZVFHMIN-NEXT: addi a5, a5, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v24, v24, a2, v0.t
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
+; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
+; ZVFHMIN-NEXT: csrr a3, vlenb
+; ZVFHMIN-NEXT: slli a3, a3, 3
+; ZVFHMIN-NEXT: add a3, sp, a3
+; ZVFHMIN-NEXT: addi a3, a3, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: mv a3, a2
@@ -10320,30 +8685,39 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
+; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB298_2
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB294_2
; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB298_2:
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB294_2:
; ZVFHMIN-NEXT: vmv1r.v v0, v3
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
@@ -10353,27 +8727,19 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: addi a0, sp, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: addi a0, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT: vmv.v.v v16, v8
; ZVFHMIN-NEXT: vmv4r.v v12, v4
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add sp, sp, a0
@@ -10381,68 +8747,68 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
- %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
- %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+ %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+ %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+ %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x half> %v
}
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16_commuted:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v8, v16, v24, v0.t
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_commuted:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 5
+; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
; ZVFHMIN-NEXT: lui a2, 8
; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT: slli a0, a3, 1
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
+; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
+; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t
+; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
+; ZVFHMIN-NEXT: sub a2, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
+; ZVFHMIN-NEXT: csrr a3, vlenb
+; ZVFHMIN-NEXT: slli a3, a3, 3
+; ZVFHMIN-NEXT: add a3, sp, a3
+; ZVFHMIN-NEXT: addi a3, a3, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: mv a3, a2
@@ -10450,38 +8816,38 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB299_2
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB295_2
; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB299_2:
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB295_2:
; ZVFHMIN-NEXT: vmv1r.v v0, v3
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
@@ -10490,15 +8856,15 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t
; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
; ZVFHMIN-NEXT: vmv.v.v v16, v8
@@ -10512,68 +8878,69 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
- %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
- %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+ %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+ %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+ %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x half> %v
}
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_unmasked:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 5
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v8
+; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT: vmset.m v7
; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT: slli a0, a3, 1
+; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
; ZVFHMIN-NEXT: vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
+; ZVFHMIN-NEXT: sub a2, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
+; ZVFHMIN-NEXT: csrr a3, vlenb
+; ZVFHMIN-NEXT: slli a3, a3, 4
+; ZVFHMIN-NEXT: add a3, sp, a3
+; ZVFHMIN-NEXT: addi a3, a3, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: mv a3, a2
@@ -10589,16 +8956,16 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB300_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB296_2
; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB300_2:
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB296_2:
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
; ZVFHMIN-NEXT: addi a0, sp, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
@@ -10607,7 +8974,7 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: mv a1, a0
@@ -10615,14 +8982,14 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v0
+; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v16
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add sp, sp, a0
@@ -10630,68 +8997,69 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
- %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+ %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+ %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
ret <vscale x 32 x half> %v
}
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 5
+; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v8
+; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT: vmset.m v7
; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT: slli a0, a3, 1
+; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
; ZVFHMIN-NEXT: vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
+; ZVFHMIN-NEXT: sub a2, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
+; ZVFHMIN-NEXT: csrr a3, vlenb
+; ZVFHMIN-NEXT: slli a3, a3, 4
+; ZVFHMIN-NEXT: add a3, sp, a3
+; ZVFHMIN-NEXT: addi a3, a3, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: mv a3, a2
@@ -10699,33 +9067,33 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB301_2
+; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB297_2
; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB301_2:
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB297_2:
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
; ZVFHMIN-NEXT: addi a0, sp, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: mv a1, a0
@@ -10733,14 +9101,15 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0
+; ZVFHMIN-NEXT: vmv8r.v v8, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add sp, sp, a0
@@ -10748,6 +9117,61 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+ %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+ %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v16, v8, v24, v0.t
+; ZVFH-NEXT: vmv.v.v v8, v16
+; ZVFH-NEXT: ret
+ %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+ %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16_commuted:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v8, v16, v24, v0.t
+; ZVFH-NEXT: ret
+ %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+ %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT: ret
+ %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT: ret
%negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
%negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
%v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -10781,7 +9205,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -10912,7 +9336,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -11038,7 +9462,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -11168,7 +9592,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -11296,11 +9720,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a3, a3, 3
; ZVFHMIN-NEXT: add a3, sp, a3
@@ -11425,11 +9849,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a3, a3, 3
; ZVFHMIN-NEXT: add a3, sp, a3
@@ -11560,11 +9984,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
; ZVFHMIN-NEXT: vmv4r.v v8, v16
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a3, a3, 3
@@ -11679,11 +10103,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
; ZVFHMIN-NEXT: vmv4r.v v8, v16
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a3, a3, 3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
index 394887fee67fc..803680dd09061 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
@@ -177,13 +177,13 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -230,14 +230,14 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -499,13 +499,13 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -558,14 +558,14 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
index 5c5542619b6ef..43b62bb7f9f76 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
@@ -177,13 +177,13 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -230,14 +230,14 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -499,13 +499,13 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -558,14 +558,14 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
index eb77b4b4dbac3..39f0163de048c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
@@ -489,13 +489,13 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -548,14 +548,14 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -592,68 +592,6 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfmul.vf v8, v8, fa0, v0.t
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfmul_vf_nxv32f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v7, v0
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmul.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB22_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fmul.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -666,57 +604,6 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfmul.vf v8, v8, fa0
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfmul_vf_nxv32f16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: vmv.v.x v16, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB23_2:
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fmul.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
index 03de2c97e685c..37ee3ad000854 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
@@ -1096,14 +1096,14 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
; CHECK-NEXT: slli a5, a1, 3
; CHECK-NEXT: sub a6, a4, a1
; CHECK-NEXT: add a7, a2, a5
-; CHECK-NEXT: add a5, a0, a5
; CHECK-NEXT: vl8re64.v v8, (a7)
; CHECK-NEXT: csrr a7, vlenb
; CHECK-NEXT: slli a7, a7, 3
; CHECK-NEXT: add a7, sp, a7
; CHECK-NEXT: addi a7, a7, 16
; CHECK-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: sltu a7, a4, a6
+; CHECK-NEXT: sltu a7, a1, a4
+; CHECK-NEXT: add a5, a0, a5
; CHECK-NEXT: addi a7, a7, -1
; CHECK-NEXT: vl8re64.v v8, (a5)
; CHECK-NEXT: csrr a5, vlenb
@@ -1217,7 +1217,7 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: sub a5, a4, a1
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vl8re64.v v24, (a3)
-; CHECK-NEXT: sltu a3, a4, a5
+; CHECK-NEXT: sltu a3, a1, a4
; CHECK-NEXT: vl8re64.v v8, (a2)
; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
index 96fbe3f6ff025..a78fea1ef3110 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
@@ -799,7 +799,7 @@ define <vscale x 16 x double> @vfneg_vv_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a3, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -821,7 +821,7 @@ define <vscale x 16 x double> @vfneg_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
index 458795db7965d..c759f2b48f53f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
@@ -93,7 +93,7 @@ define <vscale x 32 x float> @vfpext_nxv32f16_nxv32f32(<vscale x 32 x half> %a,
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
index 7127d10e67dbc..5a0e0e8004af8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
@@ -469,7 +469,7 @@ define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -497,7 +497,7 @@ define <vscale x 32 x i32> @vfptosi_nxv32i32_nxv32f32(<vscale x 32 x float> %va,
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -520,7 +520,7 @@ define <vscale x 32 x i32> @vfptosi_nxv32i32_nxv32f32_unmasked(<vscale x 32 x fl
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
index 07b58ed057508..03c5f7eed3fc0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
@@ -469,7 +469,7 @@ define <vscale x 32 x i16> @vfptoui_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -497,7 +497,7 @@ define <vscale x 32 x i32> @vfptoui_nxv32i32_nxv32f32(<vscale x 32 x float> %va,
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -520,7 +520,7 @@ define <vscale x 32 x i32> @vfptoui_nxv32i32_nxv32f32_unmasked(<vscale x 32 x fl
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
index 4177672b3a306..0f78e035e39d0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
@@ -92,7 +92,7 @@ define <vscale x 16 x float> @vfptrunc_nxv16f32_nxv16f64(<vscale x 16 x double>
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a3, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -135,11 +135,11 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
; CHECK-NEXT: slli a3, a1, 1
; CHECK-NEXT: add a6, a0, a4
; CHECK-NEXT: sub a0, a2, a3
-; CHECK-NEXT: sltu a4, a2, a0
+; CHECK-NEXT: sltu a4, a3, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a0, a4, a0
; CHECK-NEXT: sub a4, a0, a1
-; CHECK-NEXT: sltu a7, a0, a4
+; CHECK-NEXT: sltu a7, a1, a0
; CHECK-NEXT: addi a7, a7, -1
; CHECK-NEXT: and a4, a7, a4
; CHECK-NEXT: srli a7, a1, 2
@@ -162,7 +162,7 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
; CHECK-NEXT: mv a2, a3
; CHECK-NEXT: .LBB8_4:
; CHECK-NEXT: sub a0, a2, a1
-; CHECK-NEXT: sltu a3, a2, a0
+; CHECK-NEXT: sltu a3, a1, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a0, a3, a0
; CHECK-NEXT: vmv1r.v v0, v6
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
index 451b13edb794e..a77b8a6905f71 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
@@ -161,7 +161,7 @@ define <vscale x 32 x bfloat> @vfsqrt_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: and a3, a4, a3
@@ -196,7 +196,7 @@ define <vscale x 32 x bfloat> @vfsqrt_vv_nxv32bf16_unmasked(<vscale x 32 x bfloa
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v16, a2
@@ -437,7 +437,7 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: sltu a4, a0, a3
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
; ZVFHMIN-NEXT: and a3, a4, a3
@@ -478,7 +478,7 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: sltu a4, a0, a3
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v16, a2
@@ -715,7 +715,7 @@ define <vscale x 16 x double> @vfsqrt_vv_nxv16f64(<vscale x 16 x double> %va, <v
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a3, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -737,7 +737,7 @@ define <vscale x 16 x double> @vfsqrt_vv_nxv16f64_unmasked(<vscale x 16 x double
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
index 6637aced3cdac..ce30d9257cb02 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
@@ -367,13 +367,13 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -420,14 +420,14 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -459,67 +459,6 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
}
define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfsub_vf_nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v7, v0
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vmv.v.x v24, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfsub.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB22_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB22_2:
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -527,56 +466,6 @@ define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
}
define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfsub_vf_nxv32bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmset.m v24
-; CHECK-NEXT: vmv.v.x v16, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB23_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB23_2:
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfsub.vv v16, v16, v24
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -1064,13 +953,13 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1123,14 +1012,14 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1167,68 +1056,6 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfsub.vf v8, v8, fa0, v0.t
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfsub_vf_nxv32f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v7, v0
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfsub.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB46_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB46_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fsub.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -1241,57 +1068,6 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfsub.vf v8, v8, fa0
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfsub_vf_nxv32f16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: vmv.v.x v16, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB47_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB47_2:
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fsub.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
index 3cf464247250a..df4b731015243 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
@@ -396,7 +396,7 @@ define <vscale x 128 x i8> @vmax_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vs
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a3, a2, a1
-; CHECK-NEXT: sltu a4, a2, a3
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -421,7 +421,7 @@ define <vscale x 128 x i8> @vmax_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va, i
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -935,7 +935,7 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <v
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -960,7 +960,7 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va, i
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -987,11 +987,11 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
; RV32-NEXT: srli a2, a1, 2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vmax.vx v8, v8, a0, v0.t
-; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: slli a3, a1, 1
+; RV32-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a2
-; RV32-NEXT: slli a2, a1, 1
-; RV32-NEXT: sub a2, a1, a2
-; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: sub a2, a1, a3
+; RV32-NEXT: sltu a1, a3, a1
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -1007,7 +1007,7 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
; RV64-NEXT: slli a2, a1, 1
; RV64-NEXT: vslidedown.vx v0, v0, a3
; RV64-NEXT: sub a3, a1, a2
-; RV64-NEXT: sltu a4, a1, a3
+; RV64-NEXT: sltu a4, a2, a1
; RV64-NEXT: addi a4, a4, -1
; RV64-NEXT: and a3, a4, a3
; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
index e755d099df4a8..9b5e83f94e5fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
@@ -395,7 +395,7 @@ define <vscale x 128 x i8> @vmaxu_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <v
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a3, a2, a1
-; CHECK-NEXT: sltu a4, a2, a3
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -420,7 +420,7 @@ define <vscale x 128 x i8> @vmaxu_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va,
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -934,7 +934,7 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -959,7 +959,7 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va,
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -986,11 +986,11 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
; RV32-NEXT: srli a2, a1, 2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vmaxu.vx v8, v8, a0, v0.t
-; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: slli a3, a1, 1
+; RV32-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a2
-; RV32-NEXT: slli a2, a1, 1
-; RV32-NEXT: sub a2, a1, a2
-; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: sub a2, a1, a3
+; RV32-NEXT: sltu a1, a3, a1
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -1006,7 +1006,7 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
; RV64-NEXT: slli a2, a1, 1
; RV64-NEXT: vslidedown.vx v0, v0, a3
; RV64-NEXT: sub a3, a1, a2
-; RV64-NEXT: sltu a4, a1, a3
+; RV64-NEXT: sltu a4, a2, a1
; RV64-NEXT: addi a4, a4, -1
; RV64-NEXT: and a3, a4, a3
; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
index 961f63cbfbc95..1816b07c49c6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
@@ -396,7 +396,7 @@ define <vscale x 128 x i8> @vmin_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vs
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a3, a2, a1
-; CHECK-NEXT: sltu a4, a2, a3
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -421,7 +421,7 @@ define <vscale x 128 x i8> @vmin_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va, i
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -935,7 +935,7 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <v
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -960,7 +960,7 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va, i
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -987,11 +987,11 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
; RV32-NEXT: srli a2, a1, 2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vmin.vx v8, v8, a0, v0.t
-; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: slli a3, a1, 1
+; RV32-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a2
-; RV32-NEXT: slli a2, a1, 1
-; RV32-NEXT: sub a2, a1, a2
-; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: sub a2, a1, a3
+; RV32-NEXT: sltu a1, a3, a1
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -1007,7 +1007,7 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
; RV64-NEXT: slli a2, a1, 1
; RV64-NEXT: vslidedown.vx v0, v0, a3
; RV64-NEXT: sub a3, a1, a2
-; RV64-NEXT: sltu a4, a1, a3
+; RV64-NEXT: sltu a4, a2, a1
; RV64-NEXT: addi a4, a4, -1
; RV64-NEXT: and a3, a4, a3
; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
index 631799d24e14c..608790009bdb5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
@@ -395,7 +395,7 @@ define <vscale x 128 x i8> @vminu_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <v
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a3, a2, a1
-; CHECK-NEXT: sltu a4, a2, a3
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -420,7 +420,7 @@ define <vscale x 128 x i8> @vminu_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va,
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -934,7 +934,7 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -959,7 +959,7 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va,
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -986,11 +986,11 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
; RV32-NEXT: srli a2, a1, 2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vminu.vx v8, v8, a0, v0.t
-; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: slli a3, a1, 1
+; RV32-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a2
-; RV32-NEXT: slli a2, a1, 1
-; RV32-NEXT: sub a2, a1, a2
-; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: sub a2, a1, a3
+; RV32-NEXT: sltu a1, a3, a1
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -1006,7 +1006,7 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
; RV64-NEXT: slli a2, a1, 1
; RV64-NEXT: vslidedown.vx v0, v0, a3
; RV64-NEXT: sub a3, a1, a2
-; RV64-NEXT: sltu a4, a1, a3
+; RV64-NEXT: sltu a4, a2, a1
; RV64-NEXT: addi a4, a4, -1
; RV64-NEXT: and a3, a4, a3
; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
index c96a7d774a5d5..65d37bfb31916 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
@@ -700,17 +700,17 @@ define <vscale x 128 x i8> @test_vp_reverse_nxv128i8(<vscale x 128 x i8> %src, i
; CHECK-NEXT: addi a3, sp, 64
; CHECK-NEXT: li a4, -1
; CHECK-NEXT: sub a5, a0, a2
-; CHECK-NEXT: add a6, a0, a3
-; CHECK-NEXT: sltu a0, a0, a5
-; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: sltu a6, a2, a0
+; CHECK-NEXT: add a0, a0, a3
; CHECK-NEXT: addi a6, a6, -1
+; CHECK-NEXT: add a2, a3, a2
; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a5, a6, a5
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vsse8.v v8, (a6), a4
-; CHECK-NEXT: sub a6, a6, a1
-; CHECK-NEXT: and a0, a0, a5
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vsse8.v v16, (a6), a4
+; CHECK-NEXT: vsse8.v v8, (a0), a4
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: vsetvli zero, a5, e8, m8, ta, ma
+; CHECK-NEXT: vsse8.v v16, (a0), a4
; CHECK-NEXT: vle8.v v16, (a2)
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a3)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
index b8b2ba7c5e5d3..aeee1fa8215f0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
@@ -727,7 +727,7 @@ define <vscale x 32 x i32> @vp_splat_nxv32i32(i32 %val, <vscale x 32 x i1> %m, i
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
index b83ddce61f44d..3d025a29e6725 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
@@ -470,61 +470,61 @@ define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vs
; CHECK-LABEL: test_vp_splice_nxv16i64:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a5, a4, 1
+; CHECK-NEXT: addi a5, a5, -1
; CHECK-NEXT: slli a1, a4, 3
-; CHECK-NEXT: slli a7, a4, 1
-; CHECK-NEXT: addi a7, a7, -1
-; CHECK-NEXT: add a5, a0, a1
-; CHECK-NEXT: mv a6, a2
-; CHECK-NEXT: bltu a2, a7, .LBB22_2
+; CHECK-NEXT: mv a7, a2
+; CHECK-NEXT: bltu a2, a5, .LBB22_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a6, a7
+; CHECK-NEXT: mv a7, a5
; CHECK-NEXT: .LBB22_2:
; CHECK-NEXT: addi sp, sp, -80
; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; CHECK-NEXT: addi s0, sp, 80
-; CHECK-NEXT: csrr a7, vlenb
-; CHECK-NEXT: slli a7, a7, 5
-; CHECK-NEXT: sub sp, sp, a7
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: slli a5, a5, 5
+; CHECK-NEXT: sub sp, sp, a5
; CHECK-NEXT: andi sp, sp, -64
-; CHECK-NEXT: vl8re64.v v24, (a5)
-; CHECK-NEXT: slli a5, a6, 3
+; CHECK-NEXT: add a5, a0, a1
+; CHECK-NEXT: slli a7, a7, 3
; CHECK-NEXT: addi a6, sp, 64
-; CHECK-NEXT: add a5, a6, a5
-; CHECK-NEXT: mv a7, a2
+; CHECK-NEXT: mv t0, a2
; CHECK-NEXT: bltu a2, a4, .LBB22_4
; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: mv a7, a4
+; CHECK-NEXT: mv t0, a4
; CHECK-NEXT: .LBB22_4:
+; CHECK-NEXT: vl8re64.v v24, (a5)
+; CHECK-NEXT: add a5, a6, a7
; CHECK-NEXT: vl8re64.v v0, (a0)
-; CHECK-NEXT: vsetvli zero, a7, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a6)
; CHECK-NEXT: sub a0, a2, a4
-; CHECK-NEXT: add a6, a6, a1
-; CHECK-NEXT: sub a7, a3, a4
-; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: sltu a2, a4, a2
; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a0
-; CHECK-NEXT: sltu a0, a3, a7
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a7
-; CHECK-NEXT: add a7, a5, a1
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v16, (a6)
+; CHECK-NEXT: and a0, a2, a0
+; CHECK-NEXT: add a6, a6, a1
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v24, (a7)
+; CHECK-NEXT: vse64.v v16, (a6)
+; CHECK-NEXT: mv a0, a3
; CHECK-NEXT: bltu a3, a4, .LBB22_6
; CHECK-NEXT: # %bb.5:
-; CHECK-NEXT: mv a3, a4
+; CHECK-NEXT: mv a0, a4
; CHECK-NEXT: .LBB22_6:
-; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v0, (a5)
-; CHECK-NEXT: addi a2, sp, 104
-; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vse64.v v0, (a5)
+; CHECK-NEXT: sub a2, a3, a4
+; CHECK-NEXT: sltu a3, a4, a3
+; CHECK-NEXT: add a5, a5, a1
+; CHECK-NEXT: addi a4, sp, 104
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: add a1, a4, a1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vse64.v v24, (a5)
; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a2)
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v8, (a4)
; CHECK-NEXT: addi sp, s0, -80
; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
@@ -537,66 +537,66 @@ define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vs
define <vscale x 16 x i64> @test_vp_splice_nxv16i64_negative_offset(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) #0 {
; CHECK-LABEL: test_vp_splice_nxv16i64_negative_offset:
; CHECK: # %bb.0:
-; CHECK-NEXT: csrr a4, vlenb
-; CHECK-NEXT: slli a1, a4, 3
-; CHECK-NEXT: slli a7, a4, 1
-; CHECK-NEXT: addi a7, a7, -1
-; CHECK-NEXT: add a5, a0, a1
-; CHECK-NEXT: mv a6, a2
-; CHECK-NEXT: bltu a2, a7, .LBB23_2
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: slli a6, a5, 1
+; CHECK-NEXT: addi a6, a6, -1
+; CHECK-NEXT: slli a1, a5, 3
+; CHECK-NEXT: mv a4, a2
+; CHECK-NEXT: bltu a2, a6, .LBB23_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a6, a7
+; CHECK-NEXT: mv a4, a6
; CHECK-NEXT: .LBB23_2:
; CHECK-NEXT: addi sp, sp, -80
; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; CHECK-NEXT: addi s0, sp, 80
-; CHECK-NEXT: csrr a7, vlenb
-; CHECK-NEXT: slli a7, a7, 5
-; CHECK-NEXT: sub sp, sp, a7
+; CHECK-NEXT: csrr a6, vlenb
+; CHECK-NEXT: slli a6, a6, 5
+; CHECK-NEXT: sub sp, sp, a6
; CHECK-NEXT: andi sp, sp, -64
-; CHECK-NEXT: vl8re64.v v24, (a5)
-; CHECK-NEXT: slli a5, a6, 3
+; CHECK-NEXT: add a6, a0, a1
+; CHECK-NEXT: slli a4, a4, 3
; CHECK-NEXT: addi a7, sp, 64
-; CHECK-NEXT: add a6, a7, a5
; CHECK-NEXT: mv t0, a2
-; CHECK-NEXT: bltu a2, a4, .LBB23_4
+; CHECK-NEXT: bltu a2, a5, .LBB23_4
; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: mv t0, a4
+; CHECK-NEXT: mv t0, a5
; CHECK-NEXT: .LBB23_4:
+; CHECK-NEXT: vl8re64.v v24, (a6)
+; CHECK-NEXT: add a6, a7, a4
; CHECK-NEXT: vl8re64.v v0, (a0)
; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a7)
-; CHECK-NEXT: sub a0, a2, a4
-; CHECK-NEXT: add a7, a7, a1
-; CHECK-NEXT: sub t0, a3, a4
-; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: sub a0, a2, a5
+; CHECK-NEXT: sltu a2, a5, a2
; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a0
-; CHECK-NEXT: sltu a0, a3, t0
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, t0
-; CHECK-NEXT: add t0, a6, a1
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v16, (a7)
+; CHECK-NEXT: and a0, a2, a0
+; CHECK-NEXT: add a7, a7, a1
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v24, (t0)
-; CHECK-NEXT: bltu a3, a4, .LBB23_6
+; CHECK-NEXT: vse64.v v16, (a7)
+; CHECK-NEXT: mv a0, a3
+; CHECK-NEXT: bltu a3, a5, .LBB23_6
; CHECK-NEXT: # %bb.5:
-; CHECK-NEXT: mv a3, a4
+; CHECK-NEXT: mv a0, a5
; CHECK-NEXT: .LBB23_6:
-; CHECK-NEXT: li a2, 8
-; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v0, (a6)
-; CHECK-NEXT: bltu a5, a2, .LBB23_8
+; CHECK-NEXT: sub a2, a3, a5
+; CHECK-NEXT: sltu a3, a5, a3
+; CHECK-NEXT: add a5, a6, a1
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: li a3, 8
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vse64.v v24, (a5)
+; CHECK-NEXT: bltu a4, a3, .LBB23_8
; CHECK-NEXT: # %bb.7:
-; CHECK-NEXT: li a5, 8
+; CHECK-NEXT: li a4, 8
; CHECK-NEXT: .LBB23_8:
-; CHECK-NEXT: sub a2, a6, a5
+; CHECK-NEXT: sub a2, a6, a4
; CHECK-NEXT: add a1, a2, a1
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a2)
; CHECK-NEXT: addi sp, s0, -80
; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
index a075bba81d3c6..fb8480ee5f471 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
@@ -254,7 +254,7 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
; RV32-NEXT: slli a2, a3, 1
; RV32-NEXT: srli a3, a3, 2
; RV32-NEXT: sub a4, a1, a2
-; RV32-NEXT: sltu a5, a1, a4
+; RV32-NEXT: sltu a5, a2, a1
; RV32-NEXT: addi a5, a5, -1
; RV32-NEXT: and a4, a5, a4
; RV32-NEXT: vslidedown.vx v0, v0, a3
@@ -281,12 +281,12 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
; RV64-NEXT: slli a3, a2, 1
; RV64-NEXT: srli a4, a2, 2
; RV64-NEXT: sub a5, a1, a3
+; RV64-NEXT: sltu a6, a3, a1
; RV64-NEXT: vslidedown.vx v13, v0, a4
-; RV64-NEXT: sltu a4, a1, a5
-; RV64-NEXT: addi a4, a4, -1
-; RV64-NEXT: and a5, a4, a5
+; RV64-NEXT: addi a6, a6, -1
+; RV64-NEXT: and a5, a6, a5
; RV64-NEXT: sub a4, a5, a2
-; RV64-NEXT: sltu a6, a5, a4
+; RV64-NEXT: sltu a6, a2, a5
; RV64-NEXT: addi a6, a6, -1
; RV64-NEXT: and a6, a6, a4
; RV64-NEXT: srli a4, a2, 3
@@ -310,7 +310,7 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
; RV64-NEXT: mv a1, a3
; RV64-NEXT: .LBB12_4:
; RV64-NEXT: sub a3, a1, a2
-; RV64-NEXT: sltu a5, a1, a3
+; RV64-NEXT: sltu a5, a2, a1
; RV64-NEXT: addi a5, a5, -1
; RV64-NEXT: and a3, a5, a3
; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, ma
@@ -2367,7 +2367,7 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: srli a3, a1, 3
; RV32-NEXT: vslidedown.vx v0, v0, a3
-; RV32-NEXT: sltu a3, a0, a2
+; RV32-NEXT: sltu a3, a1, a0
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2390,7 +2390,7 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
; RV64-NEXT: sub a2, a0, a1
; RV64-NEXT: srli a3, a1, 3
; RV64-NEXT: vslidedown.vx v0, v0, a3
-; RV64-NEXT: sltu a3, a0, a2
+; RV64-NEXT: sltu a3, a1, a0
; RV64-NEXT: addi a3, a3, -1
; RV64-NEXT: and a2, a3, a2
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2422,8 +2422,8 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a1, a2
+; RV32-NEXT: sltu a1, a2, a1
; RV32-NEXT: srli a2, a2, 3
-; RV32-NEXT: sltu a1, a1, a3
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a2
@@ -2443,7 +2443,7 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
; RV64-NEXT: srli a4, a2, 3
; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vx v0, v0, a4
-; RV64-NEXT: sltu a4, a1, a3
+; RV64-NEXT: sltu a4, a2, a1
; RV64-NEXT: addi a4, a4, -1
; RV64-NEXT: and a3, a4, a3
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
@@ -2479,8 +2479,8 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a1, a2
+; RV32-NEXT: sltu a1, a2, a1
; RV32-NEXT: srli a2, a2, 3
-; RV32-NEXT: sltu a1, a1, a3
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a2
@@ -2500,7 +2500,7 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
; RV64-NEXT: srli a4, a2, 3
; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vx v0, v0, a4
-; RV64-NEXT: sltu a4, a1, a3
+; RV64-NEXT: sltu a4, a2, a1
; RV64-NEXT: addi a4, a4, -1
; RV64-NEXT: and a3, a4, a3
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
@@ -2537,8 +2537,8 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a1, a2
+; RV32-NEXT: sltu a1, a2, a1
; RV32-NEXT: srli a2, a2, 3
-; RV32-NEXT: sltu a1, a1, a3
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a2
@@ -2561,8 +2561,8 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV64-NEXT: vluxei32.v v8, (a0), v24, v0.t
; RV64-NEXT: sub a3, a1, a2
+; RV64-NEXT: sltu a1, a2, a1
; RV64-NEXT: srli a2, a2, 3
-; RV64-NEXT: sltu a1, a1, a3
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vx v0, v0, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
index 2ece316c7e54a..4d2ba719d63ed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
@@ -456,15 +456,15 @@ define <vscale x 16 x double> @vpload_nxv16f64(ptr %ptr, <vscale x 16 x i1> %m,
; CHECK-NEXT: vmv1r.v v8, v0
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: slli a4, a2, 3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: srli a5, a2, 3
; CHECK-NEXT: vslidedown.vx v0, v0, a5
-; CHECK-NEXT: sltu a5, a1, a3
-; CHECK-NEXT: addi a5, a5, -1
-; CHECK-NEXT: and a3, a5, a3
-; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: slli a5, a2, 3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: add a5, a0, a5
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a4), v0.t
+; CHECK-NEXT: vle64.v v16, (a5), v0.t
; CHECK-NEXT: bltu a1, a2, .LBB44_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a1, a2
@@ -496,18 +496,18 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
; CHECK-NEXT: mv a4, a5
; CHECK-NEXT: .LBB45_2:
; CHECK-NEXT: sub a6, a4, a3
-; CHECK-NEXT: slli a7, a3, 3
-; CHECK-NEXT: sltu t0, a4, a6
-; CHECK-NEXT: addi t0, t0, -1
-; CHECK-NEXT: and a6, t0, a6
-; CHECK-NEXT: srli t0, a3, 3
-; CHECK-NEXT: sub t1, a2, a5
-; CHECK-NEXT: add a5, a0, a7
-; CHECK-NEXT: sltu a2, a2, t1
+; CHECK-NEXT: sltu a7, a3, a4
+; CHECK-NEXT: sub t0, a2, a5
+; CHECK-NEXT: sltu a2, a5, a2
+; CHECK-NEXT: slli a5, a3, 3
+; CHECK-NEXT: addi a7, a7, -1
+; CHECK-NEXT: and a6, a7, a6
+; CHECK-NEXT: srli a7, a3, 3
+; CHECK-NEXT: add a5, a0, a5
; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, t1
-; CHECK-NEXT: vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v8, t0
+; CHECK-NEXT: and a2, a2, t0
+; CHECK-NEXT: vsetvli t0, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v8, a7
; CHECK-NEXT: bltu a2, a3, .LBB45_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: mv a2, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
index f92ee37051840..01edd0f912bd6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
@@ -198,22 +198,22 @@ define <vscale x 128 x i1> @vpmerge_nxv128i1(<vscale x 128 x i1> %va, <vscale x
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a1, a2
; CHECK-NEXT: .LBB7_2:
-; CHECK-NEXT: vsetvli a3, zero, e8, m8, ta, ma
+; CHECK-NEXT: sub a3, a0, a2
+; CHECK-NEXT: sltu a0, a2, a0
+; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma
; CHECK-NEXT: vmv.v.i v16, 0
-; CHECK-NEXT: sub a2, a0, a2
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmerge.vim v24, v16, 1, v0
; CHECK-NEXT: vmv1r.v v0, v9
-; CHECK-NEXT: vsetvli a3, zero, e8, m8, ta, ma
+; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma
; CHECK-NEXT: vmerge.vim v8, v16, 1, v0
-; CHECK-NEXT: sltu a0, a0, a2
+; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vmv1r.v v0, v4
; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0
-; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a3
; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
; CHECK-NEXT: vmsne.vi v24, v8, 0
-; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v5
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vmerge.vim v8, v16, 1, v0
@@ -547,7 +547,7 @@ define <vscale x 128 x i8> @vpmerge_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
; CHECK-NEXT: add a2, a0, a1
; CHECK-NEXT: sub a4, a3, a1
; CHECK-NEXT: vl8r.v v24, (a2)
-; CHECK-NEXT: sltu a2, a3, a4
+; CHECK-NEXT: sltu a2, a1, a3
; CHECK-NEXT: vl8r.v v8, (a0)
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a4
@@ -583,7 +583,7 @@ define <vscale x 128 x i8> @vpmerge_vx_nxv128i8(i8 %a, <vscale x 128 x i8> %vb,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a3, a2, a1
-; CHECK-NEXT: sltu a4, a2, a3
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, tu, ma
@@ -611,7 +611,7 @@ define <vscale x 128 x i8> @vpmerge_vi_nxv128i8(<vscale x 128 x i8> %vb, <vscale
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: sub a2, a1, a0
-; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: sltu a3, a0, a1
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
index 7e4a60095d7cc..153a0a70d098a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
@@ -2193,8 +2193,8 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (zero), v24, v0.t
; RV32-NEXT: sub a2, a1, a0
+; RV32-NEXT: sltu a1, a0, a1
; RV32-NEXT: srli a0, a0, 3
-; RV32-NEXT: sltu a1, a1, a2
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a0
@@ -2226,8 +2226,8 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t
; RV64-NEXT: sub a0, a2, a1
+; RV64-NEXT: sltu a2, a1, a2
; RV64-NEXT: srli a1, a1, 3
-; RV64-NEXT: sltu a2, a2, a0
; RV64-NEXT: addi a2, a2, -1
; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vx v0, v0, a1
@@ -2263,8 +2263,8 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a2, a1
+; RV32-NEXT: sltu a2, a1, a2
; RV32-NEXT: srli a1, a1, 3
-; RV32-NEXT: sltu a2, a2, a3
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a1
@@ -2298,8 +2298,8 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: sub a3, a2, a1
+; RV64-NEXT: sltu a2, a1, a2
; RV64-NEXT: srli a1, a1, 3
-; RV64-NEXT: sltu a2, a2, a3
; RV64-NEXT: addi a2, a2, -1
; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vx v0, v0, a1
@@ -2336,8 +2336,8 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a2, a1
+; RV32-NEXT: sltu a2, a1, a2
; RV32-NEXT: srli a1, a1, 3
-; RV32-NEXT: sltu a2, a2, a3
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a1
@@ -2371,8 +2371,8 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: sub a3, a2, a1
+; RV64-NEXT: sltu a2, a1, a2
; RV64-NEXT: srli a1, a1, 3
-; RV64-NEXT: sltu a2, a2, a3
; RV64-NEXT: addi a2, a2, -1
; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vx v0, v0, a1
@@ -2410,8 +2410,8 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a2, a1
+; RV32-NEXT: sltu a2, a1, a2
; RV32-NEXT: srli a1, a1, 3
-; RV32-NEXT: sltu a2, a2, a3
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a1
@@ -2435,8 +2435,8 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV64-NEXT: vsoxei32.v v8, (a0), v24, v0.t
; RV64-NEXT: sub a3, a2, a1
+; RV64-NEXT: sltu a2, a1, a2
; RV64-NEXT: srli a1, a1, 3
-; RV64-NEXT: sltu a2, a2, a3
; RV64-NEXT: addi a2, a2, -1
; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vx v0, v0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
index 9fd8b9d23cb5e..3468fda9011a0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
@@ -373,8 +373,8 @@ define void @vpstore_nxv16f64(<vscale x 16 x double> %val, ptr %ptr, <vscale x 1
; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: sub a3, a1, a2
+; CHECK-NEXT: sltu a1, a2, a1
; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: sltu a1, a1, a3
; CHECK-NEXT: addi a1, a1, -1
; CHECK-NEXT: and a1, a1, a3
; CHECK-NEXT: add a0, a0, a2
@@ -409,20 +409,20 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
; CHECK-NEXT: vsetvli zero, a6, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a1), v0.t
; CHECK-NEXT: sub a0, a5, a3
-; CHECK-NEXT: srli a6, a3, 3
+; CHECK-NEXT: sltu a5, a3, a5
+; CHECK-NEXT: sub a6, a2, a4
+; CHECK-NEXT: sltu a2, a4, a2
+; CHECK-NEXT: srli a4, a3, 3
; CHECK-NEXT: vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v7, a6
-; CHECK-NEXT: slli a6, a3, 3
-; CHECK-NEXT: sub a4, a2, a4
-; CHECK-NEXT: sltu a5, a5, a0
-; CHECK-NEXT: add a6, a1, a6
-; CHECK-NEXT: sltu a2, a2, a4
+; CHECK-NEXT: vslidedown.vx v0, v7, a4
+; CHECK-NEXT: slli a4, a3, 3
; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: add a4, a1, a4
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a5, a5, a0
-; CHECK-NEXT: and a0, a2, a4
+; CHECK-NEXT: and a0, a2, a6
; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v16, (a6), v0.t
+; CHECK-NEXT: vse64.v v16, (a4), v0.t
; CHECK-NEXT: bltu a0, a3, .LBB36_6
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: mv a0, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
index df97f19df7f99..4f31167b80691 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
@@ -91,7 +91,7 @@ define half @vpreduce_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscale x
; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v24, v0, a1
; CHECK-NEXT: sub a1, a0, a2
-; CHECK-NEXT: sltu a3, a0, a1
+; CHECK-NEXT: sltu a3, a2, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a1, a3, a1
; CHECK-NEXT: bltu a0, a2, .LBB6_2
@@ -120,7 +120,7 @@ define half @vpreduce_ord_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscal
; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v24, v0, a1
; CHECK-NEXT: sub a1, a0, a2
-; CHECK-NEXT: sltu a3, a0, a1
+; CHECK-NEXT: sltu a3, a2, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a1, a3, a1
; CHECK-NEXT: bltu a0, a2, .LBB7_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
index 7eea35afe0aa0..f2b84c28db92e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
@@ -884,7 +884,7 @@ define signext i32 @vpreduce_umax_nxv32i32(i32 signext %s, <vscale x 32 x i32> %
; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v24, v0, a2
; CHECK-NEXT: sub a2, a1, a3
-; CHECK-NEXT: sltu a4, a1, a2
+; CHECK-NEXT: sltu a4, a3, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a2, a4, a2
; CHECK-NEXT: bltu a1, a3, .LBB67_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
index 1e629e9d20530..535a5bdb839e0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
@@ -318,7 +318,7 @@ define zeroext i1 @vpreduce_or_nxv128i1(i1 zeroext %s, <vscale x 128 x i1> %v, <
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
index 98634fe55de41..b4ed1857652f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
@@ -557,7 +557,7 @@ define <vscale x 128 x i8> @vsadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: sub a2, a1, a0
-; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: sltu a3, a0, a1
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
@@ -580,7 +580,7 @@ define <vscale x 128 x i8> @vsadd_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
@@ -1312,7 +1312,7 @@ define <vscale x 32 x i32> @vsadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -1335,7 +1335,7 @@ define <vscale x 32 x i32> @vsadd_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
index a7d304261f87f..d761b8da7929c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
@@ -556,7 +556,7 @@ define <vscale x 128 x i8> @vsaddu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: sub a2, a1, a0
-; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: sltu a3, a0, a1
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
@@ -579,7 +579,7 @@ define <vscale x 128 x i8> @vsaddu_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
@@ -1311,7 +1311,7 @@ define <vscale x 32 x i32> @vsaddu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -1334,7 +1334,7 @@ define <vscale x 32 x i32> @vsaddu_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index d1933560f2698..e6ef1bcf73a3d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -308,12 +308,12 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
; CHECK-NEXT: csrr a3, vlenb
; CHECK-NEXT: slli a4, a3, 3
; CHECK-NEXT: slli a1, a3, 1
-; CHECK-NEXT: srli a3, a3, 2
; CHECK-NEXT: add a4, a0, a4
; CHECK-NEXT: sub a5, a2, a1
; CHECK-NEXT: vl8re32.v v24, (a4)
-; CHECK-NEXT: sltu a4, a2, a5
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: srli a3, a3, 2
; CHECK-NEXT: vl8re32.v v8, (a0)
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: and a4, a4, a5
@@ -349,14 +349,14 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
; RV32-NEXT: slli a2, a1, 3
; RV32-NEXT: add a0, a0, a2
; RV32-NEXT: slli a2, a1, 1
-; RV32-NEXT: sub a2, a1, a2
; RV32-NEXT: vl8re32.v v24, (a0)
-; RV32-NEXT: sltu a0, a1, a2
-; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: sub a0, a1, a2
+; RV32-NEXT: sltu a2, a2, a1
+; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: srli a1, a1, 2
; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; RV32-NEXT: vmerge.vvm v16, v24, v16, v0
; RV32-NEXT: ret
@@ -376,16 +376,16 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a3, a1, 3
; RV64-NEXT: slli a2, a1, 1
-; RV64-NEXT: srli a4, a1, 2
; RV64-NEXT: add a3, a0, a3
-; RV64-NEXT: sub a5, a1, a2
+; RV64-NEXT: sub a4, a1, a2
+; RV64-NEXT: sltu a5, a2, a1
; RV64-NEXT: vl8re32.v v24, (a3)
-; RV64-NEXT: sltu a3, a1, a5
-; RV64-NEXT: addi a3, a3, -1
+; RV64-NEXT: addi a5, a5, -1
+; RV64-NEXT: srli a3, a1, 2
; RV64-NEXT: vl8re32.v v8, (a0)
-; RV64-NEXT: vslidedown.vx v0, v0, a4
-; RV64-NEXT: and a3, a3, a5
-; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV64-NEXT: vslidedown.vx v0, v0, a3
+; RV64-NEXT: and a4, a5, a4
+; RV64-NEXT: vsetvli zero, a4, e32, m8, ta, ma
; RV64-NEXT: vmerge.vvm v16, v24, v16, v0
; RV64-NEXT: bltu a1, a2, .LBB28_2
; RV64-NEXT: # %bb.1:
@@ -637,10 +637,10 @@ define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a3, a1, 3
; CHECK-NEXT: sub a4, a2, a1
+; CHECK-NEXT: sltu a5, a1, a2
; CHECK-NEXT: add a3, a0, a3
-; CHECK-NEXT: sltu a5, a2, a4
-; CHECK-NEXT: vl8re64.v v24, (a3)
; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: vl8re64.v v24, (a3)
; CHECK-NEXT: srli a3, a1, 3
; CHECK-NEXT: vl8re64.v v8, (a0)
; CHECK-NEXT: vslidedown.vx v0, v0, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
index 07411b1c7ae08..c8bb009d2c3b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
@@ -144,7 +144,7 @@ define <vscale x 32 x i32> @vsext_nxv32i8_nxv32i32(<vscale x 32 x i8> %a, <vscal
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -168,7 +168,7 @@ define <vscale x 32 x i32> @vsext_nxv32i8_nxv32i32_unmasked(<vscale x 32 x i8> %
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
index 7f96da141c363..90f1ca0843b02 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
@@ -478,7 +478,7 @@ define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
; ZVFH-NEXT: slli a1, a1, 1
; ZVFH-NEXT: vslidedown.vx v0, v0, a2
; ZVFH-NEXT: sub a2, a0, a1
-; ZVFH-NEXT: sltu a3, a0, a2
+; ZVFH-NEXT: sltu a3, a1, a0
; ZVFH-NEXT: addi a3, a3, -1
; ZVFH-NEXT: and a2, a3, a2
; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -502,7 +502,7 @@ define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
; ZVFHMIN-NEXT: slli a1, a1, 1
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: sltu a3, a0, a2
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a2, a3, a2
; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -534,7 +534,7 @@ define <vscale x 32 x float> @vsitofp_nxv32f32_nxv32i32(<vscale x 32 x i32> %va,
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -557,7 +557,7 @@ define <vscale x 32 x float> @vsitofp_nxv32f32_nxv32i32_unmasked(<vscale x 32 x
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
index 0ac2ef7e251c0..a6a631be9dab4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
@@ -574,7 +574,7 @@ define <vscale x 128 x i8> @vssub_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: sub a2, a1, a0
-; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: sltu a3, a0, a1
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
@@ -598,7 +598,7 @@ define <vscale x 128 x i8> @vssub_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
@@ -1353,7 +1353,7 @@ define <vscale x 32 x i32> @vssub_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
@@ -1377,7 +1377,7 @@ define <vscale x 32 x i32> @vssub_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
index bde279a4d1f2b..1992b97e0de0d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
@@ -572,7 +572,7 @@ define <vscale x 128 x i8> @vssubu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: sub a2, a1, a0
-; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: sltu a3, a0, a1
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
@@ -596,7 +596,7 @@ define <vscale x 128 x i8> @vssubu_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
@@ -1351,7 +1351,7 @@ define <vscale x 32 x i32> @vssubu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
@@ -1375,7 +1375,7 @@ define <vscale x 32 x i32> @vssubu_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
index 0c1ca369521f7..0b07b60da8250 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
@@ -147,7 +147,7 @@ define <vscale x 15 x i16> @vtrunc_nxv15i16_nxv15i64(<vscale x 15 x i64> %a, <vs
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a3, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -201,7 +201,7 @@ define <vscale x 32 x i7> @vtrunc_nxv32i7_nxv32i32(<vscale x 32 x i32> %a, <vsca
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -233,7 +233,7 @@ define <vscale x 32 x i8> @vtrunc_nxv32i8_nxv32i32(<vscale x 32 x i32> %a, <vsca
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -280,11 +280,11 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
; CHECK-NEXT: slli a3, a1, 1
; CHECK-NEXT: add a6, a0, a4
; CHECK-NEXT: sub a0, a2, a3
-; CHECK-NEXT: sltu a4, a2, a0
+; CHECK-NEXT: sltu a4, a3, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a0, a4, a0
; CHECK-NEXT: sub a4, a0, a1
-; CHECK-NEXT: sltu a7, a0, a4
+; CHECK-NEXT: sltu a7, a1, a0
; CHECK-NEXT: addi a7, a7, -1
; CHECK-NEXT: and a4, a7, a4
; CHECK-NEXT: srli a7, a1, 2
@@ -307,7 +307,7 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
; CHECK-NEXT: mv a2, a3
; CHECK-NEXT: .LBB17_4:
; CHECK-NEXT: sub a0, a2, a1
-; CHECK-NEXT: sltu a3, a2, a0
+; CHECK-NEXT: sltu a3, a1, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a0, a3, a0
; CHECK-NEXT: vmv1r.v v0, v6
diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
index c0c749ebf3186..807c2d9fa3ce6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
@@ -478,7 +478,7 @@ define <vscale x 32 x half> @vuitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
; ZVFH-NEXT: slli a1, a1, 1
; ZVFH-NEXT: vslidedown.vx v0, v0, a2
; ZVFH-NEXT: sub a2, a0, a1
-; ZVFH-NEXT: sltu a3, a0, a2
+; ZVFH-NEXT: sltu a3, a1, a0
; ZVFH-NEXT: addi a3, a3, -1
; ZVFH-NEXT: and a2, a3, a2
; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -502,7 +502,7 @@ define <vscale x 32 x half> @vuitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
; ZVFHMIN-NEXT: slli a1, a1, 1
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: sltu a3, a0, a2
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a2, a3, a2
; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -534,7 +534,7 @@ define <vscale x 32 x float> @vuitofp_nxv32f32_nxv32i32(<vscale x 32 x i32> %va,
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -557,7 +557,7 @@ define <vscale x 32 x float> @vuitofp_nxv32f32_nxv32i32_unmasked(<vscale x 32 x
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
index 9713b617b8384..44a1084b4a208 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
@@ -144,7 +144,7 @@ define <vscale x 32 x i32> @vzext_nxv32i8_nxv32i32(<vscale x 32 x i8> %a, <vscal
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -168,7 +168,7 @@ define <vscale x 32 x i32> @vzext_nxv32i8_nxv32i32_unmasked(<vscale x 32 x i8> %
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/usub_sat.ll b/llvm/test/CodeGen/RISCV/usub_sat.ll
index 33056682dcc79..6fcc6bc5f3dcd 100644
--- a/llvm/test/CodeGen/RISCV/usub_sat.ll
+++ b/llvm/test/CodeGen/RISCV/usub_sat.ll
@@ -7,10 +7,10 @@
define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
; RV32I-LABEL: func:
; RV32I: # %bb.0:
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func:
@@ -57,10 +57,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: func2:
; RV64I: # %bb.0:
-; RV64I-NEXT: sub a1, a0, a1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sub a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: addi a0, a0, -1
-; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func2:
@@ -93,18 +93,18 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y) nounwind {
; RV32I-LABEL: func16:
; RV32I: # %bb.0:
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func16:
; RV64I: # %bb.0:
-; RV64I-NEXT: sub a1, a0, a1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sub a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: addi a0, a0, -1
-; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func16:
@@ -125,18 +125,18 @@ define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y) nounwind {
define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y) nounwind {
; RV32I-LABEL: func8:
; RV32I: # %bb.0:
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func8:
; RV64I: # %bb.0:
-; RV64I-NEXT: sub a1, a0, a1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sub a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: addi a0, a0, -1
-; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func8:
@@ -157,18 +157,18 @@ define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y) nounwind {
define zeroext i4 @func3(i4 zeroext %x, i4 zeroext %y) nounwind {
; RV32I-LABEL: func3:
; RV32I: # %bb.0:
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func3:
; RV64I: # %bb.0:
-; RV64I-NEXT: sub a1, a0, a1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sub a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: addi a0, a0, -1
-; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func3:
diff --git a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
index ef6bc022ddc9f..838f2dbe2276d 100644
--- a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
@@ -8,10 +8,10 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
; RV32I-LABEL: func32:
; RV32I: # %bb.0:
; RV32I-NEXT: mul a1, a1, a2
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func32:
@@ -65,7 +65,7 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
; RV64I-LABEL: func64:
; RV64I: # %bb.0:
; RV64I-NEXT: sub a1, a0, a2
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sltu a0, a2, a0
; RV64I-NEXT: addi a0, a0, -1
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
@@ -106,10 +106,10 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
; RV32I-NEXT: addi a3, a3, -1
; RV32I-NEXT: and a0, a0, a3
; RV32I-NEXT: and a1, a1, a3
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func16:
@@ -119,10 +119,10 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
; RV64I-NEXT: addi a3, a3, -1
; RV64I-NEXT: and a0, a0, a3
; RV64I-NEXT: and a1, a1, a3
-; RV64I-NEXT: sub a1, a0, a1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sub a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: addi a0, a0, -1
-; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func16:
@@ -153,10 +153,10 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
; RV32I-NEXT: zext.b a0, a0
; RV32I-NEXT: mul a1, a1, a2
; RV32I-NEXT: zext.b a1, a1
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func8:
@@ -164,10 +164,10 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
; RV64I-NEXT: zext.b a0, a0
; RV64I-NEXT: mul a1, a1, a2
; RV64I-NEXT: zext.b a1, a1
-; RV64I-NEXT: sub a1, a0, a1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sub a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: addi a0, a0, -1
-; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func8:
@@ -198,10 +198,10 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
; RV32I-NEXT: andi a0, a0, 15
; RV32I-NEXT: mul a1, a1, a2
; RV32I-NEXT: andi a1, a1, 15
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func4:
@@ -209,10 +209,10 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
; RV64I-NEXT: andi a0, a0, 15
; RV64I-NEXT: mul a1, a1, a2
; RV64I-NEXT: andi a1, a1, 15
-; RV64I-NEXT: sub a1, a0, a1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sub a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: addi a0, a0, -1
-; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func4:
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index 62f08d7831dda..0de2cbd76b749 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -937,9 +937,10 @@ entry:
define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
; RV32-LABEL: usubo.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: sub a1, a0, a1
-; RV32-NEXT: sltu a0, a0, a1
-; RV32-NEXT: sw a1, 0(a2)
+; RV32-NEXT: sltu a3, a1, a0
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: sw a0, 0(a2)
+; RV32-NEXT: mv a0, a3
; RV32-NEXT: ret
;
; RV64-LABEL: usubo.i32:
@@ -951,9 +952,10 @@ define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
;
; RV32ZBA-LABEL: usubo.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: sub a1, a0, a1
-; RV32ZBA-NEXT: sltu a0, a0, a1
-; RV32ZBA-NEXT: sw a1, 0(a2)
+; RV32ZBA-NEXT: sltu a3, a1, a0
+; RV32ZBA-NEXT: sub a0, a0, a1
+; RV32ZBA-NEXT: sw a0, 0(a2)
+; RV32ZBA-NEXT: mv a0, a3
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: usubo.i32:
@@ -965,9 +967,10 @@ define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
;
; RV32ZICOND-LABEL: usubo.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: sub a1, a0, a1
-; RV32ZICOND-NEXT: sltu a0, a0, a1
-; RV32ZICOND-NEXT: sw a1, 0(a2)
+; RV32ZICOND-NEXT: sltu a3, a1, a0
+; RV32ZICOND-NEXT: sub a0, a0, a1
+; RV32ZICOND-NEXT: sw a0, 0(a2)
+; RV32ZICOND-NEXT: mv a0, a3
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: usubo.i32:
@@ -987,9 +990,11 @@ entry:
define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
; RV32-LABEL: usubo.i32.constant.rhs:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: addi a2, a0, 2
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: sw a2, 0(a1)
+; RV32-NEXT: addi a2, a0, 1
+; RV32-NEXT: seqz a2, a2
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: sw a0, 0(a1)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: usubo.i32.constant.rhs:
@@ -1001,9 +1006,11 @@ define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
;
; RV32ZBA-LABEL: usubo.i32.constant.rhs:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: addi a2, a0, 2
-; RV32ZBA-NEXT: sltu a0, a0, a2
-; RV32ZBA-NEXT: sw a2, 0(a1)
+; RV32ZBA-NEXT: addi a2, a0, 1
+; RV32ZBA-NEXT: seqz a2, a2
+; RV32ZBA-NEXT: addi a0, a0, 2
+; RV32ZBA-NEXT: sw a0, 0(a1)
+; RV32ZBA-NEXT: mv a0, a2
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: usubo.i32.constant.rhs:
@@ -1015,9 +1022,11 @@ define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
;
; RV32ZICOND-LABEL: usubo.i32.constant.rhs:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: addi a2, a0, 2
-; RV32ZICOND-NEXT: sltu a0, a0, a2
-; RV32ZICOND-NEXT: sw a2, 0(a1)
+; RV32ZICOND-NEXT: addi a2, a0, 1
+; RV32ZICOND-NEXT: seqz a2, a2
+; RV32ZICOND-NEXT: addi a0, a0, 2
+; RV32ZICOND-NEXT: sw a0, 0(a1)
+; RV32ZICOND-NEXT: mv a0, a2
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: usubo.i32.constant.rhs:
@@ -1039,8 +1048,7 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
; RV32: # %bb.0: # %entry
; RV32-NEXT: li a2, -2
; RV32-NEXT: sub a2, a2, a0
-; RV32-NEXT: addi a0, a2, 1
-; RV32-NEXT: seqz a0, a0
+; RV32-NEXT: sltiu a0, a0, -2
; RV32-NEXT: sw a2, 0(a1)
; RV32-NEXT: ret
;
@@ -1057,8 +1065,7 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: li a2, -2
; RV32ZBA-NEXT: sub a2, a2, a0
-; RV32ZBA-NEXT: addi a0, a2, 1
-; RV32ZBA-NEXT: seqz a0, a0
+; RV32ZBA-NEXT: sltiu a0, a0, -2
; RV32ZBA-NEXT: sw a2, 0(a1)
; RV32ZBA-NEXT: ret
;
@@ -1075,8 +1082,7 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
; RV32ZICOND: # %bb.0: # %entry
; RV32ZICOND-NEXT: li a2, -2
; RV32ZICOND-NEXT: sub a2, a2, a0
-; RV32ZICOND-NEXT: addi a0, a2, 1
-; RV32ZICOND-NEXT: seqz a0, a0
+; RV32ZICOND-NEXT: sltiu a0, a0, -2
; RV32ZICOND-NEXT: sw a2, 0(a1)
; RV32ZICOND-NEXT: ret
;
@@ -1116,9 +1122,10 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
;
; RV64-LABEL: usubo.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: sd a1, 0(a2)
+; RV64-NEXT: sltu a3, a1, a0
+; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: sd a0, 0(a2)
+; RV64-NEXT: mv a0, a3
; RV64-NEXT: ret
;
; RV32ZBA-LABEL: usubo.i64:
@@ -1140,9 +1147,10 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
;
; RV64ZBA-LABEL: usubo.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sub a1, a0, a1
-; RV64ZBA-NEXT: sltu a0, a0, a1
-; RV64ZBA-NEXT: sd a1, 0(a2)
+; RV64ZBA-NEXT: sltu a3, a1, a0
+; RV64ZBA-NEXT: sub a0, a0, a1
+; RV64ZBA-NEXT: sd a0, 0(a2)
+; RV64ZBA-NEXT: mv a0, a3
; RV64ZBA-NEXT: ret
;
; RV32ZICOND-LABEL: usubo.i64:
@@ -1163,9 +1171,10 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
;
; RV64ZICOND-LABEL: usubo.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sub a1, a0, a1
-; RV64ZICOND-NEXT: sltu a0, a0, a1
-; RV64ZICOND-NEXT: sd a1, 0(a2)
+; RV64ZICOND-NEXT: sltu a3, a1, a0
+; RV64ZICOND-NEXT: sub a0, a0, a1
+; RV64ZICOND-NEXT: sd a0, 0(a2)
+; RV64ZICOND-NEXT: mv a0, a3
; RV64ZICOND-NEXT: ret
entry:
%t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
@@ -2810,8 +2819,7 @@ entry:
define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
; RV32-LABEL: usubo.select.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: sub a2, a0, a1
-; RV32-NEXT: bltu a0, a2, .LBB40_2
+; RV32-NEXT: bltu a1, a0, .LBB40_2
; RV32-NEXT: # %bb.1: # %entry
; RV32-NEXT: mv a0, a1
; RV32-NEXT: .LBB40_2: # %entry
@@ -2828,8 +2836,7 @@ define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZBA-LABEL: usubo.select.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: sub a2, a0, a1
-; RV32ZBA-NEXT: bltu a0, a2, .LBB40_2
+; RV32ZBA-NEXT: bltu a1, a0, .LBB40_2
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32ZBA-NEXT: mv a0, a1
; RV32ZBA-NEXT: .LBB40_2: # %entry
@@ -2846,8 +2853,7 @@ define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZICOND-LABEL: usubo.select.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: sub a2, a0, a1
-; RV32ZICOND-NEXT: sltu a2, a0, a2
+; RV32ZICOND-NEXT: sltu a2, a1, a0
; RV32ZICOND-NEXT: czero.nez a1, a1, a2
; RV32ZICOND-NEXT: czero.eqz a0, a0, a2
; RV32ZICOND-NEXT: or a0, a0, a1
@@ -2871,8 +2877,7 @@ entry:
define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
; RV32-LABEL: usubo.not.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: sub a1, a0, a1
-; RV32-NEXT: sltu a0, a0, a1
+; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: xori a0, a0, 1
; RV32-NEXT: ret
;
@@ -2885,8 +2890,7 @@ define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZBA-LABEL: usubo.not.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: sub a1, a0, a1
-; RV32ZBA-NEXT: sltu a0, a0, a1
+; RV32ZBA-NEXT: sltu a0, a1, a0
; RV32ZBA-NEXT: xori a0, a0, 1
; RV32ZBA-NEXT: ret
;
@@ -2899,8 +2903,7 @@ define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZICOND-LABEL: usubo.not.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: sub a1, a0, a1
-; RV32ZICOND-NEXT: sltu a0, a0, a1
+; RV32ZICOND-NEXT: sltu a0, a1, a0
; RV32ZICOND-NEXT: xori a0, a0, 1
; RV32ZICOND-NEXT: ret
;
@@ -2940,8 +2943,7 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
;
; RV64-LABEL: usubo.select.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: sub a2, a0, a1
-; RV64-NEXT: bltu a0, a2, .LBB42_2
+; RV64-NEXT: bltu a1, a0, .LBB42_2
; RV64-NEXT: # %bb.1: # %entry
; RV64-NEXT: mv a0, a1
; RV64-NEXT: .LBB42_2: # %entry
@@ -2969,8 +2971,7 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
;
; RV64ZBA-LABEL: usubo.select.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sub a2, a0, a1
-; RV64ZBA-NEXT: bltu a0, a2, .LBB42_2
+; RV64ZBA-NEXT: bltu a1, a0, .LBB42_2
; RV64ZBA-NEXT: # %bb.1: # %entry
; RV64ZBA-NEXT: mv a0, a1
; RV64ZBA-NEXT: .LBB42_2: # %entry
@@ -2998,8 +2999,7 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
;
; RV64ZICOND-LABEL: usubo.select.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sub a2, a0, a1
-; RV64ZICOND-NEXT: sltu a2, a0, a2
+; RV64ZICOND-NEXT: sltu a2, a1, a0
; RV64ZICOND-NEXT: czero.nez a1, a1, a2
; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
; RV64ZICOND-NEXT: or a0, a0, a1
@@ -3030,8 +3030,7 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
;
; RV64-LABEL: usubo.not.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: sltu a0, a0, a1
+; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: xori a0, a0, 1
; RV64-NEXT: ret
;
@@ -3053,8 +3052,7 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
;
; RV64ZBA-LABEL: usubo.not.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sub a1, a0, a1
-; RV64ZBA-NEXT: sltu a0, a0, a1
+; RV64ZBA-NEXT: sltu a0, a1, a0
; RV64ZBA-NEXT: xori a0, a0, 1
; RV64ZBA-NEXT: ret
;
@@ -3075,8 +3073,7 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
;
; RV64ZICOND-LABEL: usubo.not.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sub a1, a0, a1
-; RV64ZICOND-NEXT: sltu a0, a0, a1
+; RV64ZICOND-NEXT: sltu a0, a1, a0
; RV64ZICOND-NEXT: xori a0, a0, 1
; RV64ZICOND-NEXT: ret
entry:
@@ -4379,8 +4376,7 @@ continue:
define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
; RV32-LABEL: usubo.br.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: sub a1, a0, a1
-; RV32-NEXT: bgeu a0, a1, .LBB58_2
+; RV32-NEXT: bgeu a1, a0, .LBB58_2
; RV32-NEXT: # %bb.1: # %overflow
; RV32-NEXT: li a0, 0
; RV32-NEXT: ret
@@ -4401,8 +4397,7 @@ define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZBA-LABEL: usubo.br.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: sub a1, a0, a1
-; RV32ZBA-NEXT: bgeu a0, a1, .LBB58_2
+; RV32ZBA-NEXT: bgeu a1, a0, .LBB58_2
; RV32ZBA-NEXT: # %bb.1: # %overflow
; RV32ZBA-NEXT: li a0, 0
; RV32ZBA-NEXT: ret
@@ -4423,8 +4418,7 @@ define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZICOND-LABEL: usubo.br.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: sub a1, a0, a1
-; RV32ZICOND-NEXT: bgeu a0, a1, .LBB58_2
+; RV32ZICOND-NEXT: bgeu a1, a0, .LBB58_2
; RV32ZICOND-NEXT: # %bb.1: # %overflow
; RV32ZICOND-NEXT: li a0, 0
; RV32ZICOND-NEXT: ret
@@ -4478,8 +4472,7 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
;
; RV64-LABEL: usubo.br.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: bgeu a0, a1, .LBB59_2
+; RV64-NEXT: bgeu a1, a0, .LBB59_2
; RV64-NEXT: # %bb.1: # %overflow
; RV64-NEXT: li a0, 0
; RV64-NEXT: ret
@@ -4509,8 +4502,7 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
;
; RV64ZBA-LABEL: usubo.br.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sub a1, a0, a1
-; RV64ZBA-NEXT: bgeu a0, a1, .LBB59_2
+; RV64ZBA-NEXT: bgeu a1, a0, .LBB59_2
; RV64ZBA-NEXT: # %bb.1: # %overflow
; RV64ZBA-NEXT: li a0, 0
; RV64ZBA-NEXT: ret
@@ -4540,8 +4532,7 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
;
; RV64ZICOND-LABEL: usubo.br.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sub a1, a0, a1
-; RV64ZICOND-NEXT: bgeu a0, a1, .LBB59_2
+; RV64ZICOND-NEXT: bgeu a1, a0, .LBB59_2
; RV64ZICOND-NEXT: # %bb.1: # %overflow
; RV64ZICOND-NEXT: li a0, 0
; RV64ZICOND-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/xqcia.ll b/llvm/test/CodeGen/RISCV/xqcia.ll
index 3bbf33328f529..6d5fc765c49a8 100644
--- a/llvm/test/CodeGen/RISCV/xqcia.ll
+++ b/llvm/test/CodeGen/RISCV/xqcia.ll
@@ -71,10 +71,10 @@ define i32 @subsat(i32 %a, i32 %b) {
define i32 @subusat(i32 %a, i32 %b) {
; RV32I-LABEL: subusat:
; RV32I: # %bb.0:
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV32IXQCIA-LABEL: subusat:
diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index fc73ce5503ffe..da2123a5dfe74 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -1068,12 +1068,12 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
; CHECK-LABEL: @extract_value_usub(
; CHECK-NEXT: [[Z:%.*]] = add nuw i8 [[ZZ:%.*]], 1
; CHECK-NEXT: [[Y:%.*]] = add i8 [[X:%.*]], [[Z]]
-; CHECK-NEXT: [[SUB:%.*]] = xor i8 [[ZZ]], -1
-; CHECK-NEXT: [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT: [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
+; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
+; CHECK-NEXT: [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
; CHECK-NEXT: call void @use.i1(i1 [[UOV]])
; CHECK-NEXT: call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[ZZ]], -1
-; CHECK-NEXT: ret i1 [[R]]
+; CHECK-NEXT: ret i1 false
;
%z = add nuw i8 %zz, 1
%y = add i8 %x, %z
@@ -1090,11 +1090,12 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
define i1 @extract_value_usub_fail(i8 %x, i8 %z) {
; CHECK-LABEL: @extract_value_usub_fail(
; CHECK-NEXT: [[Y:%.*]] = add i8 [[X:%.*]], [[Z:%.*]]
-; CHECK-NEXT: [[SUB:%.*]] = sub i8 0, [[Z]]
-; CHECK-NEXT: [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT: [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
+; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
+; CHECK-NEXT: [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
; CHECK-NEXT: call void @use.i1(i1 [[UOV]])
; CHECK-NEXT: call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[Z]], 0
+; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[SUB]], 0
; CHECK-NEXT: ret i1 [[R]]
;
%y = add i8 %x, %z
diff --git a/llvm/test/Transforms/InstCombine/pr170634.ll b/llvm/test/Transforms/InstCombine/pr170634.ll
index 62a332e14b04a..3224b8b63afd3 100644
--- a/llvm/test/Transforms/InstCombine/pr170634.ll
+++ b/llvm/test/Transforms/InstCombine/pr170634.ll
@@ -3,12 +3,13 @@
define dso_local i64 @func(i64 noundef %x, i64 noundef %y) local_unnamed_addr {
; CHECK-LABEL: @func(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
+; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
; CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
; CHECK: if.then:
; CHECK-NEXT: br label [[RETURN:%.*]]
; CHECK: if.end:
-; CHECK-NEXT: [[TMP1:%.*]] = sub nuw i64 [[X]], [[Y]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0
; CHECK-NEXT: br label [[RETURN]]
; CHECK: return:
; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i64 [ 291, [[IF_THEN]] ], [ [[TMP1]], [[IF_END]] ]
diff --git a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
index f8b318bc3680a..30a5072c7edc8 100644
--- a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
@@ -141,16 +141,16 @@ define i1 @t1_strict_logical(i8 %base, i8 %offset) {
define i1 @t2(i8 %base, i8 %offset) {
; CHECK-LABEL: @t2(
-; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -168,16 +168,16 @@ define i1 @t2(i8 %base, i8 %offset) {
define i1 @t2_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t2_logical(
-; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -321,16 +321,16 @@ define i1 @t5_commutability2_logical(i8 %base, i8 %offset) {
define i1 @t6_commutability(i8 %base, i8 %offset) {
; CHECK-LABEL: @t6_commutability(
-; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -348,16 +348,16 @@ define i1 @t6_commutability(i8 %base, i8 %offset) {
define i1 @t6_commutability_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t6_commutability_logical(
-; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -459,14 +459,14 @@ define i1 @t7_nonstrict_logical(i8 %base, i8 %offset) {
define i1 @t8(i8 %base, i8 %offset) {
; CHECK-LABEL: @t8(
-; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -482,14 +482,14 @@ define i1 @t8(i8 %base, i8 %offset) {
define i1 @t8_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t8_logical(
-; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
diff --git a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
index c9030e5ab0321..90ca39a70a0bb 100644
--- a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
+++ b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
@@ -175,10 +175,11 @@ define i32 @test7(i32 %a, i32 %b) {
; CHECK-NEXT: [[COND:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: br i1 [[COND]], label [[BB1:%.*]], label [[BB3:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -204,10 +205,11 @@ define i32 @test8(i32 %a, i32 %b) {
; CHECK-NEXT: [[COND_NOT:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: br i1 [[COND_NOT]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -294,10 +296,11 @@ define i32 @test10(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[AND:%.*]] = and i1 [[COND]], [[COND2:%.*]]
; CHECK-NEXT: br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -325,10 +328,11 @@ define i32 @test10_logical(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[AND:%.*]] = select i1 [[COND]], i1 [[COND2:%.*]], i1 false
; CHECK-NEXT: br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -356,10 +360,11 @@ define i32 @test11(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -387,10 +392,11 @@ define i32 @test11_logical(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -418,10 +424,11 @@ define i32 @test12(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -449,10 +456,11 @@ define i32 @test12_logical(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/InstCombine/usubo.ll b/llvm/test/Transforms/InstCombine/usubo.ll
index e4b9c0e08ba22..2074190a2cd45 100644
--- a/llvm/test/Transforms/InstCombine/usubo.ll
+++ b/llvm/test/Transforms/InstCombine/usubo.ll
@@ -130,9 +130,10 @@ define i1 @sub_ne0(i8 %x, i8 %y, i1 %b) {
define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
; CHECK-LABEL: @sub_eq1(
-; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
; CHECK-NEXT: call void @use(i1 [[OV]])
+; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
; CHECK-NEXT: [[EQ1:%.*]] = icmp eq i8 [[SUB]], 1
; CHECK-NEXT: ret i1 [[EQ1]]
;
@@ -148,9 +149,10 @@ define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
define i1 @sub_sgt0(i8 %x, i8 %y, i1 %b) {
; CHECK-LABEL: @sub_sgt0(
-; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
; CHECK-NEXT: call void @use(i1 [[OV]])
+; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
; CHECK-NEXT: [[SGT0:%.*]] = icmp sgt i8 [[SUB]], 0
; CHECK-NEXT: ret i1 [[SGT0]]
;
diff --git a/llvm/test/Transforms/InstCombine/with_overflow.ll b/llvm/test/Transforms/InstCombine/with_overflow.ll
index 0c82bdc256ddf..09ef32262ea78 100644
--- a/llvm/test/Transforms/InstCombine/with_overflow.ll
+++ b/llvm/test/Transforms/InstCombine/with_overflow.ll
@@ -506,10 +506,7 @@ define { i32, i1 } @ssub_no_canonicalize_constant_arg0(i32 %x) nounwind {
define { i32, i1 } @usub_no_canonicalize_constant_arg0(i32 %x) nounwind {
; CHECK-LABEL: @usub_no_canonicalize_constant_arg0(
-; CHECK-NEXT: [[TMP1:%.*]] = sub i32 42, [[X:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 42, [[X]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i32, i1 } poison, i32 [[TMP1]], 0
-; CHECK-NEXT: [[A:%.*]] = insertvalue { i32, i1 } [[TMP3]], i1 [[TMP2]], 1
+; CHECK-NEXT: [[A:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 [[X:%.*]])
; CHECK-NEXT: ret { i32, i1 } [[A]]
;
%a = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 %x)